First, I load the three data sets (Blogs, News, and Twitter) and count the number of lines and the number of characters in each.
if (!file.exists('capstoneDataSet')) {
  url <- 'https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip'
  download.file(url = url, destfile = 'capstoneDataSet')
  unzip('capstoneDataSet')
}
# Read the three English data sets
files <- dir('./data/final/en_US', full.names = TRUE)
blogs <- readLines(files[1], warn = FALSE, encoding = "UTF-8", skipNul = TRUE)
news <- readLines(files[2], warn = FALSE, encoding = "UTF-8", skipNul = TRUE)
twitter <- readLines(files[3], warn = FALSE, encoding = "UTF-8", skipNul = TRUE)
dat <- list(Blogs = blogs, News = news, Twitter = twitter)
# The number of lines in each data set
sapply(dat, length)
## Blogs News Twitter
## 899288 1010242 2360148
# The number of characters in each data set (str_count() with no pattern counts characters)
sapply(dat, function(x) sum(stringr::str_count(x)))
## Blogs News Twitter
## 206824257 203223153 162095975
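If word counts are wanted as well, one option is to count whitespace-delimited tokens; a minimal sketch (output not shown here):
# Approximate word counts by counting whitespace-delimited tokens
sapply(dat, function(x) sum(stringr::str_count(x, "\\S+")))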
Due to limited computing power, I take a 1% sample of each data set for the exploratory analysis.
set.seed(12345)
sblogs <- sample(blogs, floor(length(blogs) * 0.01), replace = FALSE)
snews <- sample(news, floor(length(news) * 0.01), replace = FALSE)
stwitter <- sample(twitter, floor(length(twitter) * 0.01), replace = FALSE)
dat <- c(sblogs, snews, stwitter)
Next, I do some basic data cleaning to keep only words.
# Remove lines containing non-ASCII characters
NotKnown <- grep("NotKnown", iconv(dat, "latin1", "ASCII", sub = "NotKnown"))
if (length(NotKnown) > 0) dat <- dat[-NotKnown]
# Do simple cleaning
dat <- gsub("&", "", dat) # remove ampersands
dat <- gsub("RT :|@[a-zA-Z]*: ", "", dat) # remove retweet markers
dat <- gsub("@\\w+", "", dat) # remove @mentions
dat <- gsub("[[:digit:]]", "", dat) # remove digits
dat <- gsub(" #\\S*", "", dat) # remove hashtags
dat <- gsub(" ?(f|ht)tp(s?)://(.*)[.][a-z]+", "", dat) # remove URLs
dat <- qdapRegex::rm_white(dat) # remove extra whitespace
df <- dplyr::tibble(line = seq_along(dat), text = dat)
I next conduct some preliminary analysis of the cleaned data set. This section displays the most frequent unigrams in the sampled data, excluding stop words.
library(tidyverse)
library(tidytext)
data("stop_words")
unigram <- df %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words, by = "word") %>%
  count(word, sort = TRUE) %>%
  filter(n > 500) %>%
  mutate(word = reorder(word, n))
ggplot(unigram, aes(n, word)) + geom_col() +
  labs(y = NULL, x = 'Frequency', title = 'Most frequent unigrams')
Next, I show the most frequent bigrams in the sampled data, again excluding stop words.
bigram <- df %>%
  unnest_tokens(bigram, text, token = 'ngrams', n = 2) %>%
  separate(bigram, c('word1', 'word2'), sep = ' ', extra = 'drop', fill = 'right') %>%
  filter(!word1 %in% stop_words$word, !word2 %in% stop_words$word) %>%
  unite(bigram, word1, word2, sep = ' ') %>%
  filter(bigram != 'NA NA') %>%
  count(bigram, sort = TRUE) %>%
  filter(n > 25) %>%
  mutate(bigram = reorder(bigram, n))
ggplot(bigram, aes(n, bigram)) + geom_col() +
  labs(y = NULL, x = 'Frequency', title = 'Most frequent bigrams')
Finally, I graph the most frequent trigrams in the sampled data.
trigram <- df %>%
  unnest_tokens(trigram, text, token = 'ngrams', n = 3) %>%
  separate(trigram, c('word1', 'word2', 'word3'), sep = ' ', extra = 'drop', fill = 'right') %>%
  filter(!word1 %in% stop_words$word,
         !word2 %in% stop_words$word,
         !word3 %in% stop_words$word) %>%
  unite(trigram, word1, word2, word3, sep = ' ') %>%
  filter(trigram != 'NA NA NA') %>%
  count(trigram, sort = TRUE) %>%
  filter(n > 6) %>%
  mutate(trigram = reorder(trigram, n))
ggplot(trigram, aes(n, trigram)) + geom_col() +
  labs(y = NULL, x = 'Frequency', title = 'Most frequent trigrams')
My next step is to develop the prediction model and train it on this data. At this point, I think my Shiny app will have an input text box and an output field that renders the predicted next word along with the input.
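To make that interface concrete, here is a minimal sketch of such a Shiny app; predict_next_word() is a hypothetical placeholder for the n-gram model that has not been built yet.
library(shiny)
# Hypothetical placeholder for the yet-to-be-trained n-gram model:
# it returns a dummy token so the app skeleton runs end to end.
predict_next_word <- function(text) {
  if (nchar(trimws(text)) == 0) return("")
  "<predicted word>"
}
ui <- fluidPage(
  titlePanel("Next-word prediction (sketch)"),
  textInput("user_text", "Enter a phrase:"),
  verbatimTextOutput("prediction")
)
server <- function(input, output) {
  output$prediction <- renderText({
    paste(input$user_text, predict_next_word(input$user_text))
  })
}
shinyApp(ui = ui, server = server)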