The motivation for this project is to explore the blog, news, and Twitter text data, summarize its key features, and build the n-gram tables that will drive a next-word prediction model.
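# load the packages used throughout the report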
library(ggplot2)
library(dplyr)
library(RColorBrewer)
library(wordcloud)
library(tm)
library(ngram)
library(stringi)
library(data.table)
library(tidyr)
Data was imported from a blog file, a news file, and a Twitter file. For the summary results each entire file was used, but for the analysis only the first 5,000 lines of each file were used because of system memory limitations.
# set the sample size
sampleSize <- 5000
# import each file once, then take the first sampleSize lines as the working sample
blogFull <- readLines("./final/en_US/en_US.blogs.txt")
blogSample <- head(blogFull, sampleSize)
newsFull <- readLines("./final/en_US/en_US.news.txt")
newsSample <- head(newsFull, sampleSize)
twitterFull <- readLines("./final/en_US/en_US.twitter.txt")
twitterSample <- head(twitterFull, sampleSize)
# combine the three samples into one character vector and build the corpus
textSample <- c(blogSample, newsSample, twitterSample)
reviewSource <- VectorSource(textSample)
corpus <- Corpus(reviewSource)
# tidy up the data: lowercase, drop punctuation, collapse whitespace, remove stopwords
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, stripWhitespace)
corpus <- tm_map(corpus, removeWords, stopwords("english"))
# note: because punctuation is removed first, "don't" becomes "dont" and no longer
# matches the stopword list, which is why tokens like "dont" and "cant" appear in
# the n-gram tables below
docMatrix <- DocumentTermMatrix(corpus)
docMatrix <- as.matrix(docMatrix)  # dense conversion is manageable at this sample size
freq <- colSums(docMatrix)         # total frequency of each term across the sample
freq <- sort(freq, decreasing = TRUE)
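If the dense as.matrix() conversion becomes a memory bottleneck at larger sample sizes, the same frequencies can be computed on the sparse matrix directly. A minimal sketch using slam (a dependency of tm, not otherwise used in this report):
# sparse alternative: term frequencies without densifying the document-term matrix
library(slam)
sparseFreq <- sort(col_sums(DocumentTermMatrix(corpus)), decreasing = TRUE)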
# collapse the cleaned corpus into a single string for the ngram package
textStr <- concatenate(lapply(corpus, "[", 1))
Overall summary of the full files
Word_Count <- c(wordcount(blogFull), wordcount(newsFull), wordcount(twitterFull))
Row_Count <- sapply(list(blogFull, newsFull, twitterFull), length)
File_Name <- c("Blog", "News", "Twitter")
resultsTable <- data.table(File_Name, Word_Count, Row_Count)
resultsTable
## File_Name Word_Count Row_Count
## 1: Blog 37334131 899288
## 2: News 2643969 77259
## 3: Twitter 30373543 2360148
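For context, the 5,000-line samples used in the analysis cover only a small share of each file; a quick check against the row counts above:
# share of each file's lines covered by the sample
round(sampleSize / Row_Count * 100, 2)  # roughly 0.56 (blog), 6.47 (news), 0.21 (twitter)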
Below are charts showing key features of the data.
My plan is to store the two- and three-word combinations in lookup tables and build a Shiny app that filters them for the top five results given the words the user types. If there is no match for the three-word combination, the app will back off to the two-word table, and finally to the unigram table, so the user always gets a suggestion; a rough sketch of this lookup follows.
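The sketch below assumes the triResults, biResults, and uniResults phrase tables built later in this report (get.phrasetable already sorts them by frequency, and the separate() calls below add the word1/word2/word3 columns); the helper name predictNext is hypothetical:
# hypothetical backoff lookup over the phrase tables
predictNext <- function(w1, w2, n = 5) {
  # try the trigram table first: match on the first two words
  hits <- triResults[triResults$word1 == w1 & triResults$word2 == w2, ]
  if (nrow(hits) > 0) return(head(hits$word3, n))
  # back off to the bigram table: match on the last word only
  hits <- biResults[biResults$word1 == w2, ]
  if (nrow(hits) > 0) return(head(hits$word2, n))
  # final fallback: the most frequent single words
  head(trimws(uniResults$ngrams), n)
}
# e.g. predictNext("new", "york") should suggest "times", "city", ...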
Below is a snippet of the unigram, bigram, and trigram results that will be used in the model.
# split each n-gram into its component words for lookup
triResults <- separate(triResults, ngrams, c("word1", "word2", "word3"), sep = " ", remove = FALSE)
biResults <- separate(biResults, ngrams, c("word1", "word2"), sep = " ", remove = FALSE)
print(head(triResults[,-7]))
## ngrams word1 word2 word3 freq prop
## 1 first time since first time since 14 5.608727e-05
## 2 new york times new york times 12 4.807480e-05
## 3 pates fountain parks pates fountain parks 11 4.406857e-05
## 4 cant wait see cant wait see 10 4.006234e-05
## 5 happy new year happy new year 9 3.605610e-05
## 6 new york city new york city 9 3.605610e-05
print(head(biResults[,-6]))
## ngrams word1 word2 freq prop
## 1 new york new york 100 0.0004006218
## 2 last year last year 86 0.0003445347
## 3 right now right now 78 0.0003124850
## 4 high school high school 69 0.0002764290
## 5 dont know dont know 59 0.0002363668
## 6 last night last night 59 0.0002363668
print(head(uniResults[,-4]))
## ngrams freq prop
## 1 said 1446 0.005792968
## 2 will 1321 0.005292192
## 3 one 1286 0.005151975
## 4 just 1178 0.004719305
## 5 like 1068 0.004278623
## 6 can 1020 0.004086326
What other options did I miss? What else could I do to make the model more accurate or run faster?
Plot code
# uniGram
pal <- brewer.pal(8, "Dark2")
words <- names(freq)
wordcloud(words[1:100], freq[1:100], colors = pal)
# convert the named frequency vector to a table and keep the words seen more
# than 200 times for reference
freq <- data.table(Word = names(freq), Frequency = as.numeric(freq))
highFreqWords <- freq[Frequency > 200]
uniGram <- ngram(textStr, n = 1)
uniResults <- get.phrasetable(uniGram)
uniResults$cumProb <- cumsum(uniResults$prop)
# top 5 words by proportion
top5Uni <- uniResults[1:5, ]
top5Uni <- arrange(top5Uni, desc(prop))
# reorder() keeps the bars sorted by frequency instead of alphabetically
g <- ggplot(data = top5Uni, aes(x = reorder(ngrams, -freq), y = freq)) +
  geom_bar(stat = "identity") +
  labs(title = "Top 5 Words", x = "Words", y = "Frequency")
print(g)
plot(uniResults$cumProb, type = "l", main = "Unigram coverage", xlab = "Number of unique words", ylab = "Cumulative proportion of word occurrences")
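The cumulative proportions also answer coverage questions directly, which should help when deciding how far to prune the tables; for example (exact counts depend on the sample):
# number of unique words needed to cover 50% and 90% of all word occurrences
which(uniResults$cumProb >= 0.5)[1]
which(uniResults$cumProb >= 0.9)[1]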
# biGram
biGram <- ngram(textStr, n = 2)
biResults <- get.phrasetable(biGram)
biResults$cumProb <- cumsum(biResults$prop)
plot(biResults$cumProb, type = "l", main = "Bigram coverage", xlab = "Number of unique bigrams", ylab = "Cumulative proportion of bigram occurrences")
# triGram
triGram <- ngram(textStr, n = 3)
triResults <- get.phrasetable(triGram)
triResults$cumProb <- cumsum(triResults$prop)
plot(triResults$cumProb, type = "l", main = "Trigram coverage", xlab = "Number of unique trigrams", ylab = "Cumulative proportion of trigram occurrences")