First, I load the three data sets (Blogs, News, and Twitter) and count the number of lines and the number of characters in each.
if (!file.exists('capstoneDataSet')) {
  url <- 'https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip'
  download.file(url = url, destfile = 'capstoneDataSet')
  unzip('capstoneDataSet')
}
# Read the three English data sets
files <- dir('./data/final/en_US', full.names = TRUE)
blogs <- readLines(files[1], warn = FALSE, encoding = "UTF-8", skipNul = TRUE)
news <- readLines(files[2], warn = FALSE, encoding = "UTF-8", skipNul = TRUE)
twitter <- readLines(files[3], warn = FALSE, encoding = "UTF-8", skipNul = TRUE)
dat <- list(Blogs = blogs, News = news, Twitter = twitter)
# The number of lines in each data set
sapply(dat, length)
## Blogs News Twitter
## 899288 1010242 2360148
# The number of characters in each data set (str_count() with no pattern counts characters)
sapply(dat, function(x) sum(stringr::str_count(x)))
## Blogs News Twitter
## 206824257 203223153 162095975
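If word counts are wanted as well, one option is to count whitespace-delimited tokens; a minimal sketch (output not shown here):
# Approximate word counts by counting whitespace-delimited tokens
sapply(dat, function(x) sum(stringr::str_count(x, "\\S+")))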
Due to limited computing power, I take a 1% sample of each data set for the exploratory analysis.
set.seed(12345)
sblogs <- sample(blogs, floor(length(blogs) * 0.01), replace = FALSE)
snews <- sample(news, floor(length(news) * 0.01), replace = FALSE)
stwitter <- sample(twitter, floor(length(twitter) * 0.01), replace = FALSE)
dat <- c(sblogs, snews, stwitter)
Next, I do some basic data cleaning to keep only words.
# Remove lines containing non-ASCII characters
NotKnown <- grep("NotKnown", iconv(dat, "latin1", "ASCII", sub = "NotKnown"))
if (length(NotKnown) > 0) dat <- dat[-NotKnown]
# Do simple cleaning
dat <- gsub("&", "", dat) # remove ampersands
dat <- gsub("RT :|@[a-zA-Z]*: ", "", dat) # remove retweet markers
dat <- gsub("@\\w+", "", dat) # remove @mentions
dat <- gsub("[[:digit:]]", "", dat) # remove digits
dat <- gsub(" #\\S*", "", dat) # remove hashtags
dat <- gsub(" ?(f|ht)tp(s?)://(.*)[.][a-z]+", "", dat) # remove URLs
dat <- qdapRegex::rm_white(dat) # remove extra whitespace
df <- dplyr::tibble(line = seq_along(dat), text = dat)
I next conduct some preliminary analysis of the cleaned data set. This section displays the most frequent unigrams in the sampled data, excluding stop words.
library(tidyverse)
library(tidytext)
data("stop_words")
unigram <- df %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words, by = "word") %>%
  count(word, sort = TRUE) %>%
  filter(n > 500) %>%
  mutate(word = reorder(word, n))
ggplot(unigram, aes(n, word)) + geom_col() +
  labs(y = NULL, x = 'Frequency', title = 'Most frequent unigrams')
Next, I show the most frequent bigrams in the sampled data, again excluding stop words.
bigram <- df %>%
  unnest_tokens(bigram, text, token = 'ngrams', n = 2) %>%
  separate(bigram, c('word1', 'word2'), sep = ' ', extra = 'drop', fill = 'right') %>%
  filter(!word1 %in% stop_words$word, !word2 %in% stop_words$word) %>%
  unite(bigram, word1, word2, sep = ' ') %>%
  filter(bigram != 'NA NA') %>%
  count(bigram, sort = TRUE) %>%
  filter(n > 25) %>%
  mutate(bigram = reorder(bigram, n))
ggplot(bigram, aes(n, bigram)) + geom_col() +
  labs(y = NULL, x = 'Frequency', title = 'Most frequent bigrams')
Finally, I graph the most frequent trigrams in the sampled data.
trigram <- df %>%
  unnest_tokens(trigram, text, token = 'ngrams', n = 3) %>%
  separate(trigram, c('word1', 'word2', 'word3'), sep = ' ', extra = 'drop', fill = 'right') %>%
  filter(!word1 %in% stop_words$word,
         !word2 %in% stop_words$word,
         !word3 %in% stop_words$word) %>%
  unite(trigram, word1, word2, word3, sep = ' ') %>%
  filter(trigram != 'NA NA NA') %>%
  count(trigram, sort = TRUE) %>%
  filter(n > 6) %>%
  mutate(trigram = reorder(trigram, n))
ggplot(trigram, aes(n, trigram)) + geom_col() +
  labs(y = NULL, x = 'Frequency', title = 'Most frequent trigrams')
My next step is to develop the prediction model and train it on this data. At this point, I think my Shiny app will have an input text box and an output field that renders the predicted next word along with the input.
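To make that interface concrete, here is a minimal sketch of such a Shiny app; predict_next_word() is a hypothetical placeholder for the n-gram model that has not been built yet.
library(shiny)
# Hypothetical placeholder for the yet-to-be-trained n-gram model:
# it returns a dummy token so the app skeleton runs end to end.
predict_next_word <- function(text) {
  if (nchar(trimws(text)) == 0) return("")
  "<predicted word>"
}
ui <- fluidPage(
  titlePanel("Next-word prediction (sketch)"),
  textInput("user_text", "Enter a phrase:"),
  verbatimTextOutput("prediction")
)
server <- function(input, output) {
  output$prediction <- renderText({
    paste(input$user_text, predict_next_word(input$user_text))
  })
}
shinyApp(ui = ui, server = server)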