The data files were previously downloaded from https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip into the project’s working directory. They are now loaded into the R environment and merged into a single character vector:
blogs <- readr::read_lines("Dataset/en_US/en_US.blogs.txt")
news <- readr::read_lines("Dataset/en_US/en_US.news.txt")
twitter <- readr::read_lines("Dataset/en_US/en_US.twitter.txt")
txtdata <- c(blogs, news, twitter)
For later data cleaning, three profanity word lists were downloaded from external sources into the working directory and cleaned with a custom clean.text() helper:
profanity <- clean.text(unique(c(readr::read_lines("Dataset/badwords1.txt"),
                                 readr::read_lines("Dataset/badwords2.txt"),
                                 readr::read_lines("Dataset/badwords3.txt"))))
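The clean.text() helper is defined outside this report. A minimal sketch of what such a helper could look like, assuming it only needs to normalise case, strip punctuation and whitespace, and drop blanks and duplicates (the exact steps are an assumption, not the author's definition):

# Hypothetical sketch of clean.text(); the actual definition is not shown in this report
clean.text <- function(x) {
  x <- iconv(x, to = "ASCII", sub = "")  # drop non-ASCII characters
  x <- tolower(x)                        # normalise case
  x <- gsub("[[:punct:]]", "", x)        # remove punctuation
  x <- trimws(x)                         # trim surrounding whitespace
  unique(x[x != ""])                     # drop empty strings and duplicates
}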
The summary statistics for each dataset and for the combined dataset, namely the size of each source text vector in memory (KB), its number of lines, and its word count, are the following:
data.frame(Dataset = c("Blogs", "News", "Twitter", "JOINED DATASET"),
           Size_in_Memory_Kb = c(pryr::object_size(blogs)/1024, pryr::object_size(news)/1024,
                                 pryr::object_size(twitter)/1024, pryr::object_size(txtdata)/1024),
           Number_of_Lines = c(length(blogs), length(news), length(twitter), length(txtdata)),
           Number_of_Words = c(sapply(strsplit(paste(blogs, collapse = " "), " "), length),
                               sapply(strsplit(paste(news, collapse = " "), " "), length),
                               sapply(strsplit(paste(twitter, collapse = " "), " "), length),
                               sapply(strsplit(paste(txtdata, collapse = " "), " "), length))
) %>% knitr::kable()
| Dataset | Size_in_Memory_Kb | Number_of_Lines | Number_of_Words |
|---|---|---|---|
| Blogs | 261483.0 | 899288 | 37334131 |
| News | 263516.6 | 1010242 | 34372530 |
| Twitter | 326645.2 | 2360148 | 30373545 |
| JOINED DATASET | 851447.1 | 4269678 | 102080206 |
rm(list = c("blogs", "news", "twitter"))
The following cleaning steps remove non-ASCII characters, replace contractions with their long forms, remove numbers and punctuation, convert the text to lower case, and strip redundant white space:
txtdata <- iconv(txtdata, to = "ASCII", sub = "") # rm non-ASCII characters
txtdata <- qdap::replace_contraction(txtdata)
txtdata <- tm::removeNumbers(txtdata)
txtdata <- tm::removePunctuation(txtdata)
txtdata <- tolower(txtdata)
txtdata <- tm::stripWhitespace(txtdata)
Creating a corpus:
txtdata_corpus <- quanteda::corpus(txtdata)
Summary of types and tokens in the corpus:
s <- summary(quanteda::corpus(paste(txtdata, collapse = " ")))
s[c("Types", "Tokens")] %>% knitr::kable(); rm(s)
| Types | Tokens |
|---|---|
| 898715 | 100912493 |
Constructing a tokens object, keeping only word tokens of two or more characters:
t1 <- quanteda::tokens(txtdata_corpus); t1 <- quanteda::tokens_keep(t1, min_nchar = 2)
Removing common English stopwords and profanity:
t1 <- quanteda::tokens_select(t1, quanteda::stopwords(language="en"), selection = 'remove') # rm english stopwords
t1 <- quanteda::tokens_select(t1, profanity, selection = 'remove') # rm profanity
Determining and plotting top 25 words (unigrams):
qt_dfm1 <- quanteda::dfm(t1)
quanteda::topfeatures(qt_dfm1, 25)
## can said just one like get time new good now day
## 312179 304846 302790 288860 269009 226195 214018 194028 178147 177705 169358
## us know love people back go see first also make going
## 164409 162708 159909 158298 141123 139232 138419 134606 130179 130122 126407
## think last great
## 126035 124692 122931
tibble::enframe(quanteda::topfeatures(qt_dfm1, 25), name = "Terms", value = "Counts") %>%
ggplot(aes(x=reorder(Terms, Counts), y=Counts)) + geom_bar(stat = "identity") + coord_flip() + xlab("Top 25 Unigrams")
rm(qt_dfm1)
Top 25 bigrams (two-word sequences):
t2 <- quanteda::tokens_ngrams(t1, n=2L, skip = 0L, concatenator = "_")
qt_dfm2 <- quanteda::dfm(t2)
quanteda::topfeatures(qt_dfm2, 25)
## right_now let_us new_york last_year can_wait
## 24916 22262 19395 18601 16654
## last_night high_school years_ago can_get feel_like
## 15397 13918 13713 13629 12940
## last_week first_time looking_forward make_sure looks_like
## 12536 12146 11424 10447 9879
## st_louis even_though happy_birthday good_morning new_jersey
## 9646 9322 8862 8359 8350
## just_got can_see let_know united_states one_day
## 8311 8230 8222 7837 7784
tibble::enframe(quanteda::topfeatures(qt_dfm2, 25), name = "Terms", value = "Counts") %>%
ggplot(aes(x=reorder(Terms, Counts), y=Counts)) + geom_bar(stat = "identity") + coord_flip() + xlab("Top 25 Bigrams")
rm(t2); rm(qt_dfm2)
Top 25 trigrams (three-word sequences):
t3 <- quanteda::tokens_ngrams(t1, n=3L, skip = 0L, concatenator = "_")
qt_dfm3 <- quanteda::dfm(t3)
quanteda::topfeatures(qt_dfm3, 25)
## happy_mothers_day let_us_go can_wait_see
## 3419 3283 3158
## new_york_city let_us_know happy_new_year
## 2548 2539 1938
## let_us_get two_years_ago president_barack_obama
## 1853 1573 1479
## new_york_times cinco_de_mayo let_us_see
## 1381 1313 1114
## world_war_ii st_louis_county looking_forward_seeing
## 1085 1064 1010
## gov_chris_christie first_time_since new_years_eve
## 923 900 885
## st_patricks_day can_wait_get two_weeks_ago
## 878 832 780
## three_years_ago love_love_love let_us_just
## 766 754 737
## let_us_make
## 709
tibble::enframe(quanteda::topfeatures(qt_dfm3, 25), name = "Terms", value = "Counts") %>%
ggplot(aes(x=reorder(Terms, Counts), y=Counts)) + geom_bar(stat = "identity") + coord_flip() + xlab("Top 25 Trigrams")
rm(t3); rm(qt_dfm3)
My plans for creating the prediction algorithm and Shiny app involve the following steps:

1. Calculate the probabilities of all unigrams, bigrams, trigrams, etc. observed in the data and build the corresponding probability tables (a sketch is given after this list).
2. Develop and implement an algorithm that calculates the conditional probability of a word given the preceding n-gram.
3. For the Shiny app, the main goal is to keep its memory footprint below 1 GB, which may require training the model on a random sample of the data.
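As an illustration of steps 1 and 2, here is a minimal sketch (not the final implementation) of how the n-gram counts built above could be turned into frequency tables and a maximum-likelihood next-word lookup, i.e. P(w2 | w1) = count(w1 w2) / count(w1). The helper name predict_next_word and the use of data.table are assumptions made for this sketch; it also reuses the tokens object t1 from earlier in this report:

library(data.table)

# Hypothetical sketch; assumes the tokens object `t1` built earlier in this report
uni_freq <- colSums(quanteda::dfm(t1))                                   # unigram counts
bi_freq  <- colSums(quanteda::dfm(quanteda::tokens_ngrams(t1, n = 2L)))  # bigram counts

bi_dt <- data.table(ngram = names(bi_freq), count = as.numeric(bi_freq))
bi_dt[, c("w1", "w2") := tstrsplit(ngram, "_", fixed = TRUE)]

# Maximum-likelihood conditional probability: P(w2 | w1) = count(w1_w2) / count(w1)
bi_dt[, prob := count / uni_freq[w1]]

# Hypothetical helper: the k most likely next words after `prev`
predict_next_word <- function(prev, k = 3) {
  bi_dt[w1 == prev][order(-prob)][seq_len(k), .(w2, prob)]
}

predict_next_word("new")   # e.g. "york", "jersey", ...

To stay within the 1 GB limit, the same tables could be built from a random subset of the corpus, for example by tokenising sample(txtdata, length(txtdata) %/% 10) instead of the full txtdata.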