Preparation

Load the sample Twitter file

twitter <- file("../Week1/Coursera-SwiftKey/final/en_US/en_US.twitter.txt", open = "r")
tw <- readLines(twitter)  # read every tweet; the warnings below flag embedded nul characters
## Warning in readLines(twitter): line 167155 appears to contain an embedded
## nul
## Warning in readLines(twitter): line 268547 appears to contain an embedded
## nul
## Warning in readLines(twitter): line 1274086 appears to contain an embedded
## nul
## Warning in readLines(twitter): line 1759032 appears to contain an embedded
## nul
tws <- tw[1:200]                                                 # work with the first 200 tweets
twstok <- strsplit(gsub("[.,?:;()&!-]", "", tolower(tws)), " ")  # lower-case, strip punctuation, split on spaces
twstoku <- unlist(twstok)                                        # flatten into one token vector
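The embedded-nul warnings from readLines() are harmless for this exploration; if desired, they can be avoided with the skipNul argument, e.g.:

tw <- readLines("../Week1/Coursera-SwiftKey/final/en_US/en_US.twitter.txt", skipNul = TRUE)  # drop nuls silently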

Some words are more frequent than others - what are the distributions of word frequencies?

Plot showing word frequencies

twsfreq <- table(twstoku)
twsfreq1 <- sort(twsfreq, decreasing=TRUE)
plot(head(twsfreq1, 20))

What are the frequencies of 2-grams and 3-grams in the dataset?

2-grams plot

library(NLP)  # provides ngrams(); loaded here because it is first used here
two_gram <- ngrams(twstoku, 2L)
tgu <- sapply(two_gram, paste, collapse = " ")  # collapse each pair into "word1 word2"
two_gram_freq <- table(tgu)
two_gram_freq1 <- sort(two_gram_freq, decreasing = TRUE)
plot(head(two_gram_freq1, 20))

3-grams plot

three_gram <- ngrams(twstoku, 3L)
tgu <- sapply(three_gram, paste, collapse = " ")  # collapse each triple into "word1 word2 word3"
three_gram_freq <- table(tgu)
three_gram_freq1 <- sort(three_gram_freq, decreasing = TRUE)
plot(head(three_gram_freq1, 20))

How many unique words do you need in a frequency sorted dictionary to cover 50% of all word instances in the language? 90%?

To cover 50%

i <- 1
s <- 0
while (s <= length(twstoku) * 0.5) {
        s <- s + twsfreq1[i]   # accumulate counts of the most frequent words
        i <- i + 1
}
print(paste(s, i))  # note: i ends one past the last word used, so i - 1 = 107 unique words cover 50%
## [1] "1183 108"

To cover 90%

i <- 1
s <- 0
while (s <= length(twstoku) * 0.9) {
        s <- s + twsfreq1[i]
        i <- i + 1
}
print(paste(s, i))  # again one past the last word used: 845 unique words cover 90%
## [1] "2127 846"

How do you evaluate how many of the words come from foreign languages?

I inspect the least frequent words, which is where misspellings and foreign terms tend to concentrate.

twsfreq2 <- sort(twsfreq, decreasing=FALSE)
head(twsfreq2)
## twstoku
##         'cause            "as "collaboration            "d"          "high 
##              1              1              1              1              1 
##           "his 
##              1
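A related quick check, if useful: counting the hapax legomena (words that occur exactly once in the sample), since one-off tokens are the most likely candidates for foreign or malformed words:

sum(twsfreq == 1)  # number of words seen exactly once in the 200-tweet sample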

Can you think of a way to increase the coverage?

Using the alphabet of the language and finding the tokens that contain none of its letters

ml <- gsub("[a-z]", "", twstoku)  # delete every English letter from each token
ml[nchar(ml) > 0]                 # tokens with anything left over are non-alphabetic or foreign
##   [1] "'"      "'"      "'"      "'"      "5"      "'"      "'"     
##   [8] "👦"     "$163"   "#"      "“"      "\""     "\""     "”"
##  [15] "#"      "#"      "'"      "4"      "'"      "♥"    "'"     
##  [22] "<3"     "99%"    "#"      "\""     "\""     "~"      "99"    
##  [29] "'"      "#"      "\""     "\""     "40"     "'"      "23"    
##  [36] "'"      "#"      "4"      "/"      "$99"    "$"      "/"     
##  [43] "\""     "\""     "'"      "'"      "\""     "\""     "'"     
##  [50] "'"      "'"      "'"      "'"      ">>"     "#"      ">>>"   
##  [57] "#"      "'"      "#8"     "88"     "45347"  "#"      "#"     
##  [64] "43"     "<3"     "'"      "#"      ">>>>>>" "'"      "'"     
##  [71] "«"      "»"      "'"      "#"      "#"      "''"     "'"
##  [78] "\""     "\""     "\""     "\""     "****"   "#"      "#"     
##  [85] "'"      "'"      "'"      "#"      "//"     "'"      "#"     
##  [92] "#"      "'"      "/"      "/"      "'"      "/"      "4"     
##  [99] "\"\""   "\"\""   "'"      "'"      "“"      "”"      "'"
## [106] "'"      "#"      "**"     "$60"    "\""     "\""     "20"    
## [113] "'"      "'"      "#"      "5000"   "#9"     "'"      "'"     
## [120] "30"     "\""     "\""     "10"     "#"      "2"      "2"     
## [127] "’"      "“"      "”"      "/"      "5"      "10"     "'"
## [134] "2"      "'"      "'"      "118"    "#13"    "'"      "#"     
## [141] "2"      "/"      "'"      "'"      "'"      "<3"     "+"     
## [148] "$200+"  "'"      "\""     "\""     "'"      "\""     "\""
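Beyond filtering out non-alphabetic tokens, coverage can also be increased by collapsing inflected forms onto a common stem, so fewer dictionary entries cover the same word instances. A minimal sketch, assuming the SnowballC package is available (it is not used elsewhere in this report):

library(SnowballC)
stems <- wordStem(twstoku, language = "english")  # e.g. "running", "runs" -> "run"
length(unique(twstoku))  # unique raw words
length(unique(stems))    # fewer unique stems cover the same word instances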

Load the packages needed for exploring datasets

library(tm)
library(NLP)
library(wordcloud)
## Warning: package 'wordcloud' was built under R version 3.4.2
## Loading required package: RColorBrewer
library(stringi)
library(manipulate)
library(openNLP)
## Warning: package 'openNLP' was built under R version 3.4.2
library(RColorBrewer)
library(RWeka)
## Warning: package 'RWeka' was built under R version 3.4.2

Load and explore data

blogs <- file("../Week1/Coursera-SwiftKey/final/en_US/en_US.blogs.txt", "r")
news <- file("../Week1/Coursera-SwiftKey/final/en_US/en_US.news.txt", "r")
tweets <- file("../Week1/Coursera-SwiftKey/final/en_US/en_US.twitter.txt", "r")
blogdata <- readLines(blogs)
newsdata <- readLines(news)
## Warning in readLines(news): incomplete final line found on '../Week1/
## Coursera-SwiftKey/final/en_US/en_US.news.txt'
tweetdata <- readLines(tweets)
## Warning in readLines(tweets): line 167155 appears to contain an embedded
## nul
## Warning in readLines(tweets): line 268547 appears to contain an embedded
## nul
## Warning in readLines(tweets): line 1274086 appears to contain an embedded
## nul
## Warning in readLines(tweets): line 1759032 appears to contain an embedded
## nul
data.Summary <- data.frame(Dataset = c("Blogs", "News", "Tweets"),
                       Filesize = c(file.size("../Week1/Coursera-SwiftKey/final/en_US/en_US.blogs.txt"),
                                    file.size("../Week1/Coursera-SwiftKey/final/en_US/en_US.news.txt"),
                                    file.size("../Week1/Coursera-SwiftKey/final/en_US/en_US.twitter.txt")),
                       Lines = c(length(blogdata),
                                 length(newsdata),
                                 length(tweetdata)),
                       Words = c(sum(sapply(strsplit(blogdata, " "), FUN = length, simplify = TRUE)),
                                 sum(sapply(strsplit(newsdata, " "), FUN = length, simplify = TRUE)),
                                 sum(sapply(strsplit(tweetdata, " "), FUN = length, simplify = TRUE))
                                 )
                       )
close(blogs)                  
close(news)
close(tweets)
print(data.Summary)
##   Dataset  Filesize   Lines    Words
## 1   Blogs 210160014  899288 37334131
## 2    News 205811889   77259  2643969
## 3  Tweets 167105338 2360148 30373543
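As a cross-check on the word counts, stringi (loaded above) provides stri_count_words(), which handles repeated whitespace and punctuation more robustly than splitting on a single space; a one-line sketch:

sum(stri_count_words(blogdata))  # alternative word count for the blogs data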

Create a sample corpus to explore data

set.seed(1234)  # make the sampling reproducible
blogs.Sample <- sample(blogdata, length(blogdata) * 0.1, replace = FALSE)    # 10% of each source
news.Sample <- sample(newsdata, length(newsdata) * 0.1, replace = FALSE)
tweets.Sample <- sample(tweetdata, length(tweetdata) * 0.1, replace = FALSE)

sample.Corpus <- c(blogs.Sample, news.Sample, tweets.Sample)
sample.Corpus <- VCorpus(VectorSource(sample.Corpus))

sample.Corpus <- tm_map(sample.Corpus, removeNumbers)                      # drop digits
sample.Corpus <- tm_map(sample.Corpus, removePunctuation)                  # drop punctuation
sample.Corpus <- tm_map(sample.Corpus, stripWhitespace)                    # collapse repeated whitespace
sample.Corpus <- tm_map(sample.Corpus, content_transformer(tolower))       # lower-case the text
sample.Corpus <- tm_map(sample.Corpus, removeWords, stopwords("english"))  # remove common English stopwords
sample.Corpus <- tm_map(sample.Corpus, PlainTextDocument)                  # coerce back to plain text documents
unigram <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))
bigram <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
unigram.tdm <- TermDocumentMatrix(sample.Corpus, control = list(tokenize = unigram))
bigram.tdm <- TermDocumentMatrix(sample.Corpus, control = list(tokenize = bigram))
unigram.tdm.temp <- removeSparseTerms(unigram.tdm, sparse = 0.99)   # drop terms missing from >99% of documents
bigram.tdm.temp <- removeSparseTerms(bigram.tdm, sparse = 0.999)    # keep more of the rarer bigrams
unitdmf <- sort(rowSums(as.matrix(unigram.tdm.temp)), decreasing=TRUE)
bitdmf <- sort(rowSums(as.matrix(bigram.tdm.temp)), decreasing=TRUE)

Data exploration - unigrams

barplot(head(unitdmf,5), main = "Most Frequent Unigrams - Top 5", col="deepskyblue1")

wordcloud(names(unitdmf), unitdmf, colors = brewer.pal(6, "Paired"))

Data exploration - bigrams

barplot(head(bitdmf,5), main = "Most Frequent Bigrams - Top 5", col="deepskyblue1")

wordcloud(names(bitdmf), bitdmf, colors = brewer.pal(6, "Paired"))
## Warning in wordcloud(names(bitdmf), bitdmf, colors = brewer.pal(6,
## "Paired")): cant wait could not be fit on page. It will not be plotted.
## Warning in wordcloud(names(bitdmf), bitdmf, colors = brewer.pal(6,
## "Paired")): dont know could not be fit on page. It will not be plotted.
## Warning in wordcloud(names(bitdmf), bitdmf, colors = brewer.pal(6,
## "Paired")): right now could not be fit on page. It will not be plotted.
## Warning in wordcloud(names(bitdmf), bitdmf, colors = brewer.pal(6,
## "Paired")): last night could not be fit on page. It will not be plotted.
## Warning in wordcloud(names(bitdmf), bitdmf, colors = brewer.pal(6,
## "Paired")): can get could not be fit on page. It will not be plotted.
## Warning in wordcloud(names(bitdmf), bitdmf, colors = brewer.pal(6,
## "Paired")): happy birthday could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(names(bitdmf), bitdmf, colors = brewer.pal(6,
## "Paired")): thanks following could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(names(bitdmf), bitdmf, colors = brewer.pal(6,
## "Paired")): even though could not be fit on page. It will not be plotted.
## Warning in wordcloud(names(bitdmf), bitdmf, colors = brewer.pal(6,
## "Paired")): thanks much could not be fit on page. It will not be plotted.
## Warning in wordcloud(names(bitdmf), bitdmf, colors = brewer.pal(6,
## "Paired")): look like could not be fit on page. It will not be plotted.

Plan for creating a prediction algorithm and Shiny app

The analysis above summarises the most frequently used words in the corpus provided to us. We will build on these findings to create a predictive model based on commonly used n-grams, prioritizing suggestions by word popularity so that users can pick the next word from a short list, backed by 1-gram, 2-gram and 3-gram frequencies in that order.

Also, since the training data is large, we will create a sampling strategy to reduce it to a manageable size.

Finally, we will deploy the model in a Shiny app where users can enter a short phrase and the app will suggest the most suitable next word using our predictive model.
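As an illustration of the intended lookup (a minimal sketch only; predict_next is a hypothetical helper, not the final model), the bigram counts computed above already support a naive next-word suggestion:

predict_next <- function(word, bigram_freqs, n = 3) {
        # keep bigrams whose first word matches, then return the top-n second words
        hits <- bigram_freqs[startsWith(names(bigram_freqs), paste0(word, " "))]
        head(sub("^\\S+ ", "", names(sort(hits, decreasing = TRUE))), n)
}
predict_next("last", bitdmf)  # e.g. should rank "night" highly, per the bigram plot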