The ability to predict the next word a user will type is especially valuable on the web and on mobile devices. The goal of the Capstone project is to build a model that predicts the next word a user will type. The goal of this Milestone Report is to present the results of an exploratory data analysis and to describe the ideas we intend to develop in order to complete the project.
# ---- Load the three en_US corpora ----
# NOTE(review): hard-coded, machine-specific working directory. It is kept
# because later statements resolve relative file names against it.
setwd("C:/Users/brad.dietz/Desktop/AMN/R/Examples/DS/Capstone/data/en_US")

# Read every line of a text file through a binary-mode connection.
#
# In text mode on Windows, reading stops at the first embedded Ctrl-Z (0x1A)
# byte, which silently truncates the file. Opening the connection with "rb"
# reads the whole file; readLines() still accepts LF, CRLF or CR line endings.
#
# path: file name, resolved against the working directory.
# Returns a character vector with one element per line.
read_lines_binary <- function(path) {
  con <- file(path, open = "rb")
  on.exit(close(con), add = TRUE)
  # NOTE(review): the corpora are presumably UTF-8; "latin1" is kept from the
  # original call so downstream counts stay comparable -- confirm.
  readLines(con, encoding = "latin1", warn = FALSE)
}

blogs <- read_lines_binary("en_US.blogs.txt")
news <- read_lines_binary("en_US.news.txt")
twitter <- read_lines_binary("en_US.twitter.txt")
library(stringr)
# ---- Per-corpus raw statistics ----
# For each corpus: size on disk in megabytes (bytes / 1024 / 1024), number of
# lines read, and the number of whitespace-delimited tokens on every line.

# Blogs
blogs_size <- file.size("en_US.blogs.txt") / (1024 * 1024)
blogs_lines <- length(blogs)
blogs_count <- stringr::str_count(blogs, pattern = "\\S+")

# News
news_size <- file.size("en_US.news.txt") / (1024 * 1024)
news_lines <- length(news)
news_count <- stringr::str_count(news, pattern = "\\S+")

# Twitter
twitter_size <- file.size("en_US.twitter.txt") / (1024 * 1024)
twitter_lines <- length(twitter)
twitter_count <- stringr::str_count(twitter, pattern = "\\S+")

# The raw text is no longer needed once the counts exist; free the memory.
rm(blogs, news, twitter)
# ---- Summary table: one row per corpus ----
# FileSize is in megabytes; WordCount, MaxWords, MinWords and MeanWords are
# derived from the per-line word counts of each corpus.
dataSummary <- data.frame(
  FileName  = c("Blogs", "News", "Twitter"),
  FileSize  = round(c(blogs_size, news_size, twitter_size), digits = 2),
  LineCount = c(blogs_lines, news_lines, twitter_lines),
  WordCount = c(sum(blogs_count), sum(news_count), sum(twitter_count)),
  MaxWords  = c(max(blogs_count), max(news_count), max(twitter_count)),
  MinWords  = c(min(blogs_count), min(news_count), min(twitter_count)),
  MeanWords = round(
    c(mean(blogs_count), mean(news_count), mean(twitter_count)),
    digits = 2
  )
)

# Auto-print the table.
dataSummary
## FileName FileSize LineCount WordCount MaxWords MinWords MeanWords
## 1 Blogs 200.42 899288 37334690 6630 1 41.52
## 2 News 196.28 77259 2643979 1031 1 34.22
## 3 Twitter 159.36 2360148 30374166 47 1 12.87
100% of the data has been processed for the Milestone report. It was necessary to reduce the size of the datasets because the function that creates N-Grams crashes with large datasets and the model will ultimately be hosted on the web. The scan function was used so that only a portion of the dataset was run through the N-Gram function. N-Grams with a count of 1 were excluded because the resulting files are large and cannot be used in the Shiny app. Posting the entire code is not feasible, so the next section describes the process qualitatively.
The following steps were taken on the three datasets (Blogs, Twitter, and News):
# ---- Most frequent unigrams: bar chart ----
library(ggplot2)

# NOTE(review): hard-coded, machine-specific working directory. It is kept so
# the working directory seen by subsequent code is unchanged.
setwd("C:/Users/brad.dietz/Desktop/AMN/R/Examples/DS/Capstone/data/en_US/Aggregate/Gr2/Processed")

# Load the pre-aggregated unigram table. TRUE is spelled out because `T` is an
# ordinary variable that can be reassigned.
unigrams <- read.csv(file = "unigramsfinal.csv", header = TRUE)

# Keep columns 2 and 3, which hold the Splittext and SumNF columns used below
# (column 1 is presumably a row index written with the CSV -- TODO confirm).
unigrams <- unigrams[, c(2, 3)]

# Keep the (up to) 20 rows with the highest SumNF. head() is used rather than
# `[1:20, ]` so a table with fewer than 20 rows is not padded with all-NA rows.
unigrams <- head(unigrams[order(-unigrams$SumNF), ], 20)

# Character copy of the unigram text. The plot below reads Splittext directly;
# this column is retained so the resulting data frame has the same columns.
unigrams$unigrams <- as.character(unigrams$Splittext)

# Bars ordered from most to least frequent, with comma-formatted counts.
ggplot(data = unigrams, aes(x = reorder(Splittext, -SumNF), y = SumNF)) +
  geom_bar(stat = "identity") +
  ggtitle("Most frequent Unigrams\n") +
  theme(
    plot.title = element_text(size = rel(1.5)),
    axis.text.x = element_text(angle = 45, face = "bold", hjust = 1)
  ) +
  xlab("Unigram") +
  ylab("Count") +
  scale_y_continuous(labels = scales::comma)
# ---- Bigrams: word cloud and bar chart of the most frequent entries ----
library(wordcloud)
library(ggplot2)

# NOTE(review): hard-coded, machine-specific working directory. It is kept so
# the working directory seen by subsequent code is unchanged.
setwd("C:/Users/brad.dietz/Desktop/AMN/R/Examples/DS/Capstone/data/en_US/Aggregate/Gr2/Processed")

# Load the pre-aggregated bigram table. TRUE is spelled out because `T` is an
# ordinary variable that can be reassigned.
bigrams <- read.csv(file = "bigramsfinal.csv", header = TRUE)

# Keep columns 2 and 3: after this, column 1 is passed to wordcloud() as the
# words and column 2 as their frequencies.
bigrams <- bigrams[, c(2, 3)]

# Word cloud drawn from the full table, limited to 100 words, most frequent
# words placed first (random.order = FALSE).
wordcloud(
  bigrams[, 1], bigrams[, 2],
  scale = c(5, 0.5), max.words = 100,
  random.order = FALSE, rot.per = 0.35, use.r.layout = FALSE,
  colors = brewer.pal(12, "Paired")
)

# Keep the (up to) 20 rows with the highest SumNF. head() is used rather than
# `[1:20, ]` so a table with fewer than 20 rows is not padded with all-NA rows.
bigrams <- head(bigrams[order(-bigrams$SumNF), ], 20)
bigrams$bigrams <- as.character(bigrams$bigrams)

# Inside aes(), `bigrams` refers to the column of that name in the data frame.
ggplot(data = bigrams, aes(x = reorder(bigrams, -SumNF), y = SumNF)) +
  geom_bar(stat = "identity") +
  ggtitle("Most frequent Bigrams\n") +
  theme(
    plot.title = element_text(size = rel(1.5)),
    axis.text.x = element_text(angle = 45, face = "bold", hjust = 1)
  ) +
  xlab("Bigram") +
  ylab("Count") +
  scale_y_continuous(labels = scales::comma)
# ---- Trigrams: word cloud and bar chart of the most frequent entries ----
library(wordcloud)
library(ggplot2)

# NOTE(review): hard-coded, machine-specific working directory. It is kept so
# the working directory seen by subsequent code is unchanged.
setwd("C:/Users/brad.dietz/Desktop/AMN/R/Examples/DS/Capstone/data/en_US/Aggregate/Gr2/Processed")

# Load the pre-aggregated trigram table. TRUE is spelled out because `T` is an
# ordinary variable that can be reassigned.
trigrams <- read.csv(file = "trigramsfinal.csv", header = TRUE)

# Keep columns 2 and 3: after this, column 1 is passed to wordcloud() as the
# words and column 2 as their frequencies.
trigrams <- trigrams[, c(2, 3)]

# Word cloud drawn from the full table; only trigrams with a frequency of at
# least 6500 are plotted, most frequent first (random.order = FALSE).
wordcloud(
  trigrams[, 1], trigrams[, 2],
  min.freq = 6500, scale = c(4, 0.1),
  random.order = FALSE, rot.per = 0.35, use.r.layout = FALSE,
  colors = brewer.pal(12, "Paired")
)

# Keep the (up to) 20 rows with the highest SumNF. head() is used rather than
# `[1:20, ]` so a table with fewer than 20 rows is not padded with all-NA rows.
trigrams <- head(trigrams[order(-trigrams$SumNF), ], 20)
trigrams$trigrams <- as.character(trigrams$trigrams)

# Inside aes(), `trigrams` refers to the column of that name in the data frame.
ggplot(data = trigrams, aes(x = reorder(trigrams, -SumNF), y = SumNF)) +
  geom_bar(stat = "identity") +
  ggtitle("Most frequent Trigrams\n") +
  theme(
    plot.title = element_text(size = rel(1.5)),
    axis.text.x = element_text(angle = 45, face = "bold", hjust = 1)
  ) +
  xlab("Trigram") +
  ylab("Count") +
  scale_y_continuous(labels = scales::comma)
# ---- Most frequent quadgrams: bar chart ----
library(ggplot2)

# NOTE(review): hard-coded, machine-specific working directory. It is kept so
# the working directory seen by subsequent code is unchanged.
setwd("C:/Users/brad.dietz/Desktop/AMN/R/Examples/DS/Capstone/data/en_US/Aggregate/Gr2/Processed")

# Load the pre-aggregated quadgram table. TRUE is spelled out because `T` is an
# ordinary variable that can be reassigned.
quadgrams <- read.csv(file = "quadgramsfinal.csv", header = TRUE)

# Keep columns 2 and 3, which hold the quadgrams and SumNF columns used below
# (column 1 is presumably a row index written with the CSV -- TODO confirm).
quadgrams <- quadgrams[, c(2, 3)]

# Keep the (up to) 20 rows with the highest SumNF. head() is used rather than
# `[1:20, ]` so a table with fewer than 20 rows is not padded with all-NA rows.
quadgrams <- head(quadgrams[order(-quadgrams$SumNF), ], 20)
quadgrams$quadgrams <- as.character(quadgrams$quadgrams)

# Inside aes(), `quadgrams` refers to the column of that name in the data frame.
ggplot(data = quadgrams, aes(x = reorder(quadgrams, -SumNF), y = SumNF)) +
  geom_bar(stat = "identity") +
  ggtitle("Most frequent Quadgrams\n") +
  theme(
    plot.title = element_text(size = rel(1.5)),
    axis.text.x = element_text(angle = 45, face = "bold", hjust = 1)
  ) +
  xlab("Quadgram") +
  ylab("Count") +
  scale_y_continuous(labels = scales::comma)
Determine the best predictive model given the size and speed constraints of a Shiny application.
The data size restrictions of the Shiny app will be met by increasing the minimum N-Gram count. For instance, the Trigram table is 14.9 MB when the minimum count is 2, compared with 47.5 MB when the minimum count is 1.
I plan to investigate the following models: