First we download the data

The goal of this milestone is to build models for text prediction using natural language processing techniques. For this analysis, a large text corpus will be used as training data.

# First, load the required packages.
library(tm)
## Warning: package 'tm' was built under R version 3.4.2
## Loading required package: NLP
require(ggplot2)
## Loading required package: ggplot2
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate
require(RWeka)
## Loading required package: RWeka
## Warning: package 'RWeka' was built under R version 3.4.2
# create a directory and download the required datasets 

if(!file.exists("./datasets")){dir.create("./datasets")}

fileUrl <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip" 

if(!file.exists("./datasets/Coursera-SwiftKey.zip")){
   download.file(fileUrl,destfile="./datasets/Coursera-SwiftKey.zip",mode = "wb")
}

# Unzip the files

if(!file.exists("./datasets/final")){
    unzip(zipfile="./datasets/Coursera-SwiftKey.zip",exdir="./datasets")
}

list.files("./datasets/final/en_US")
## [1] "en_US.blogs.txt"   "en_US.news.txt"    "en_US.twitter.txt"

Now that all three files are unpacked, it is time to read them in and sample them for our study.

blogs <- readLines("./datasets/final/en_US/en_US.blogs.txt")
news <- readLines("./datasets/final/en_US/en_US.news.txt")
## Warning in readLines("./datasets/final/en_US/en_US.news.txt"): incomplete final line
## found on './datasets/final/en_US/en_US.news.txt'
twitter <- readLines("./datasets/final/en_US/en_US.twitter.txt")
## Warning in readLines("./datasets/final/en_US/en_US.twitter.txt"): line 167155
## appears to contain an embedded nul
## Warning in readLines("./datasets/final/en_US/en_US.twitter.txt"): line 268547
## appears to contain an embedded nul
## Warning in readLines("./datasets/final/en_US/en_US.twitter.txt"): line 1274086
## appears to contain an embedded nul
## Warning in readLines("./datasets/final/en_US/en_US.twitter.txt"): line 1759032
## appears to contain an embedded nul
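The incomplete-final-line and embedded-nul warnings are harmless here, but they can be avoided by reading the files with an explicit encoding and skipNul = TRUE. A minimal sketch of that alternative read (same files as above):

# Read the raw text while skipping embedded nul characters (optional alternative)
blogs   <- readLines("./datasets/final/en_US/en_US.blogs.txt",   encoding = "UTF-8", skipNul = TRUE)
news    <- readLines("./datasets/final/en_US/en_US.news.txt",    encoding = "UTF-8", skipNul = TRUE)
twitter <- readLines("./datasets/final/en_US/en_US.twitter.txt", encoding = "UTF-8", skipNul = TRUE)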
# Now we check the size of the three files in megabytes
blog_file_Size <- file.info("./datasets/final/en_US/en_US.blogs.txt")$size / 1024^2

news_file_Size <- file.info("./datasets/final/en_US/en_US.news.txt")$size / 1024^2

twitter_file_Size <- file.info("./datasets/final/en_US/en_US.twitter.txt")$size / 1024^2

print(paste("News Length = ", length(news),
            ", Blogs Length = ", length(blogs),
            ", Twitter Length = ", length(twitter)
            ))
## [1] "News Length =  77259 , Blogs Length =  899288 , Twitter Length =  2360148"
library(tm)
library(stringi)

stri_stats_general(blogs)
##       Lines LinesNEmpty       Chars CharsNWhite 
##      899288      899288   208361438   171926076
stri_stats_general(news)
##       Lines LinesNEmpty       Chars CharsNWhite 
##       77259       77259    15683765    13117038
stri_stats_general(twitter)
##       Lines LinesNEmpty       Chars CharsNWhite 
##     2360148     2360148   162384825   134370864
# Now the word counts

words_blogs <-stri_count_words(blogs)
words_twitter <-stri_count_words(twitter)
words_news <-stri_count_words(news)


# Now the summary of findings

dataset_view <- data.frame(source = c("blogs", "news", "twitter"),
          file.size.MB = c(blog_file_Size, news_file_Size, twitter_file_Size),
         num.lines = c(length(blogs), length(news), length(twitter)),
          num.words = c(sum(words_blogs), sum(words_news), sum(words_twitter)),
         mean.num.words = c(mean(words_blogs), mean(words_news), mean(words_twitter)))

print(dataset_view)
##    source file.size.MB num.lines num.words mean.num.words
## 1   blogs     200.4242    899288  38154238       42.42716
## 2    news     196.2775     77259   2693898       34.86840
## 3 twitter     159.3641   2360148  30218125       12.80349
# Take the first 10,000 lines from each source as a sample
fileSamples <- c(news[1:10000], blogs[1:10000], twitter[1:10000])
newDataSets <- VCorpus(VectorSource(fileSamples))

#rm(fileSamples)
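Taking the first 10,000 lines from each source is simple, but a random sample is usually more representative of the corpus. A minimal sketch, assuming an arbitrary 1% sampling rate (the rate and the set.seed value are illustrative choices, not part of the original analysis):

# Random 1% sample from each source (illustrative alternative)
set.seed(1234)
sampleRate <- 0.01
fileSamples <- c(sample(blogs,   round(length(blogs)   * sampleRate)),
                 sample(news,    round(length(news)    * sampleRate)),
                 sample(twitter, round(length(twitter) * sampleRate)))
newDataSets <- VCorpus(VectorSource(fileSamples))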

Now we clean the files for analysis.

newDataSets <- tm_map(newDataSets, stripWhitespace )

newDataSets <- tm_map(newDataSets, removePunctuation)

newDataSets <- tm_map(newDataSets, removeNumbers)
newDataSets <- tm_map(newDataSets, removeWords, stopwords())
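Note that stopwords() returns lowercase terms, so a lowercasing step is commonly applied before the stopword removal above; otherwise capitalized stopwords such as "The" survive. A minimal sketch using tm's content_transformer:

# Lowercase the corpus (would run before the removeWords step above)
newDataSets <- tm_map(newDataSets, content_transformer(tolower))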

Converting the cleaned corpus into a data frame for the n-gram analysis.

corpusDf <-data.frame(text=unlist(sapply(newDataSets, 
                  `[`, "content")), stringsAsFactors=F)

findNGrams <- function(corp, grams) {
  ngram <- NGramTokenizer(corp, Weka_control(min = grams, max = grams,
                      delimiters = " \\r\\n\\t.,;:\"()?!"))
  ngram2 <- data.frame(table(ngram))
  # keep only the top 100 n-grams by frequency
  ngram3 <- ngram2[order(ngram2$Freq, decreasing = TRUE), ][1:100, ]
  colnames(ngram3) <- c("String","Count")
  ngram3
}

TwoGrams <- findNGrams(corpusDf, 2)
ThreeGrams <- findNGrams(corpusDf, 3)
FourGrams <- findNGrams(corpusDf, 4)

Bar plots of the top n-gram frequencies

We build simple bar plots of the top TwoGrams, ThreeGrams, and FourGrams to better visualize our findings.

par(mfrow = c(1, 1) )
### Twograms barplot
barplot(TwoGrams[1:20,2], cex.names=0.5, names.arg=TwoGrams[1:20,1], col= c("red", "yellow", "green", "violet", "orange", "blue", "pink", "cyan"), main="Top twoGrams", las=2)

### Threegrams barplot
barplot(ThreeGrams[1:20,2], cex.names=0.5, names.arg=ThreeGrams[1:20,1], col= c("red", "yellow", "green", "violet", "orange", "blue", "pink", "cyan"), main="Top ThreeGrams", las=2)

### Fourgrams barplot
barplot(FourGrams[1:20,2], cex.names=0.5, names.arg= FourGrams[1:20,1], col= c("red", "yellow", "green", "violet", "orange", "blue", "pink", "cyan"), main="Top FourGrams", las=2)

Conclusion

This concludes the data processing and exploratory visualization. The next step is preparing the data for prediction and reporting through a Shiny app.

For prediction, we will use the n-grams to predict the next word. The model will then be integrated into a Shiny app for user interaction.
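As an illustration of the intended approach, here is a minimal sketch of a next-word lookup that backs off from the four-gram table to the trigram and bigram tables built above. The function name predictNextWord and the simple backoff scheme are illustrative assumptions, not the final model; since the tables keep only the top 100 n-grams, the lookup will often return nothing.

# Illustrative next-word lookup with simple backoff (sketch, not the final model)
predictNextWord <- function(phrase, fourGrams = FourGrams,
                            threeGrams = ThreeGrams, twoGrams = TwoGrams) {
  words <- unlist(strsplit(tolower(phrase), "\\s+"))
  lookup <- function(tbl, prefix) {
    hits <- tbl[grepl(paste0("^", prefix, " "), tolower(tbl$String)), ]
    if (nrow(hits) == 0) return(NULL)
    # tables are already sorted by frequency, so take the last word of the top match
    tail(unlist(strsplit(as.character(hits$String[1]), " ")), 1)
  }
  n <- length(words)
  result <- NULL
  if (n >= 3) result <- lookup(fourGrams, paste(tail(words, 3), collapse = " "))
  if (is.null(result) && n >= 2) result <- lookup(threeGrams, paste(tail(words, 2), collapse = " "))
  if (is.null(result) && n >= 1) result <- lookup(twoGrams, tail(words, 1))
  result
}

predictNextWord("thanks for the")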