0. Load the libraries

library(tm)
## Loading required package: NLP
library(wordcloud)
## Loading required package: RColorBrewer
library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate
library(Hmisc)
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
## 
##     format.pval, round.POSIXt, trunc.POSIXt, units
library(openNLP)
library(RWeka)
library(Rstem)
library(SnowballC)
## 
## Attaching package: 'SnowballC'
## The following objects are masked from 'package:Rstem':
## 
##     getStemLanguages, wordStem
library(qdap)
## Loading required package: qdapDictionaries
## Loading required package: qdapRegex
## 
## Attaching package: 'qdapRegex'
## The following object is masked from 'package:ggplot2':
## 
##     %+%
## Loading required package: qdapTools
## 
## Attaching package: 'qdap'
## The following objects are masked from 'package:tm':
## 
##     as.DocumentTermMatrix, as.TermDocumentMatrix
## The following object is masked from 'package:NLP':
## 
##     ngrams
## The following object is masked from 'package:base':
## 
##     Filter

1. Demonstrate that you’ve downloaded the data and have successfully loaded it in

First, I will shorten the corpus: the full corpus is too large and would make my computer very slow. Second, I will load the shortened corpus with the tm package and check the corpus metadata.

setwd('~/R/NLP/Cousera-Swiftkey/en_US/')



blog <-  readLines("~/R/NLP/Cousera-Swiftkey/en_US/en_US.blogs.txt",n=100)

news <-  readLines("~/R/NLP/Cousera-Swiftkey/en_US/en_US.news.txt",n=100)

twitter <-  readLines("~/R/NLP/Cousera-Swiftkey/en_US/en_US.twitter.txt",n=100)
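Note that writeLines below assumes the shorts output directory already exists; if it does not, it could be created first with something like the following (a minimal sketch using the same path as above):

# create the output directory for the shortened samples if it is missing
dir.create('~/R/NLP/Cousera-Swiftkey/shorts', recursive = TRUE, showWarnings = FALSE)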

writeLines(blog,'~/R/NLP/Cousera-Swiftkey/shorts/en_US.blogs.txt')
writeLines(news,'~/R/NLP/Cousera-Swiftkey/shorts/en_US.news.txt')
writeLines(twitter,'~/R/NLP/Cousera-Swiftkey/shorts/en_US.twitter.txt')


(ovid <- Corpus(DirSource('~/R/NLP/Cousera-Swiftkey/shorts'),
                                readerControl = list(reader = readPlain,
                                                       language = "en_US",
                                                        load = TRUE)))
## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 0
## Content:  documents: 3
inspect(ovid)
## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 0
## Content:  documents: 3
## 
## [[1]]
## <<PlainTextDocument>>
## Metadata:  7
## Content:  chars: 25744
## 
## [[2]]
## <<PlainTextDocument>>
## Metadata:  7
## Content:  chars: 18920
## 
## [[3]]
## <<PlainTextDocument>>
## Metadata:  7
## Content:  chars: 6755
summary(ovid)
##                   Length Class             Mode
## en_US.blogs.txt   2      PlainTextDocument list
## en_US.news.txt    2      PlainTextDocument list
## en_US.twitter.txt 2      PlainTextDocument list
meta(ovid[[1]])
##   author       : character(0)
##   datetimestamp: 2017-02-19 09:55:42
##   description  : character(0)
##   heading      : character(0)
##   id           : en_US.blogs.txt
##   language     : en_US
##   origin       : character(0)
meta(ovid[[2]])
##   author       : character(0)
##   datetimestamp: 2017-02-19 09:55:42
##   description  : character(0)
##   heading      : character(0)
##   id           : en_US.news.txt
##   language     : en_US
##   origin       : character(0)
meta(ovid[[3]])
##   author       : character(0)
##   datetimestamp: 2017-02-19 09:55:42
##   description  : character(0)
##   heading      : character(0)
##   id           : en_US.twitter.txt
##   language     : en_US
##   origin       : character(0)

Preprocessing: strip extra whitespace, convert the corpus to lower case, and remove English stopwords.

ovid <- tm_map(ovid, stripWhitespace)
ovid <- tm_map(ovid, content_transformer(tolower))

ovid <- tm_map(ovid, removeWords, stopwords("english"))
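The SnowballC package loaded above also provides stemming; it could be applied as an additional preprocessing step if desired (a sketch only; stemming is not applied to the counts reported below):

# reduce words to their stems, e.g. "running" -> "run"
ovid_stemmed <- tm_map(ovid, stemDocument)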

2. Create a basic report of summary statistics about the data sets.

I will check the word frequencies with the tm package to get an initial feel for the data.

# build a document-term matrix from the cleaned corpus
dtm <- DocumentTermMatrix(ovid)

# plot correlations between frequent terms (requires the Rgraphviz package)
plot(dtm, terms = findFreqTerms(dtm, lowfreq = 8)[1:10], corThreshold = 0.5)

inspect(removeSparseTerms(dtm, 0.1))
## <<DocumentTermMatrix (documents: 3, terms: 87)>>
## Non-/sparse entries: 261/0
## Sparsity           : 0%
## Maximal term length: 9
## Weighting          : term frequency (tf)
## 
##                    Terms
## Docs                according also always another around back better big
##   en_US.blogs.txt           1   13      3       3      3    2      2   2
##   en_US.news.txt            1    9      2       5      2    3      1   2
##   en_US.twitter.txt         1    1      3       1      1    1      1   1
##                    Terms
## Docs                birthday can care check come company day decided die
##   en_US.blogs.txt          1   8    1     2    6       2   5       3   1
##   en_US.news.txt           1   4    1     2    2       1   1       2   1
##   en_US.twitter.txt        1   4    1     1    1       1   2       1   1
##                    Terms
## Docs                done eat even every feel find first free fun game get
##   en_US.blogs.txt      1   2    7     4    2    3     5    2   3    1  12
##   en_US.news.txt       1   1    2     2    2    1    10    4   1    2   6
##   en_US.twitter.txt    1   1    1     3    1    1     4    1   1    1   3
##                    Terms
## Docs                going good got green help just know last life like
##   en_US.blogs.txt       5    5   4     1    2    9    9    4    1   21
##   en_US.news.txt        3    2   5     2    1    5    5    5    2    4
##   en_US.twitter.txt     3    6   3     1    1    7    3    2    1    3
##                    Terms
## Docs                list little long love make makes many might move must
##   en_US.blogs.txt      1      5    2    4    6     2    7     1    1    3
##   en_US.news.txt       1      1    1    2    7     1    1     4    1    3
##   en_US.twitter.txt    1      1    2    5    2     1    2     1    1    1
##                    Terms
## Docs                need never new next nice now now. one open others
##   en_US.blogs.txt      5     5   7    3    7   4    1  11    1      1
##   en_US.news.txt       1     1  13    3    1   3    1   6    1      1
##   en_US.twitter.txt    1     2   3    2    1   1    1   2    1      1
##                    Terms
## Docs                people ready really right room saw saying see seems
##   en_US.blogs.txt       11     1      5     5    1   1      1   3     3
##   en_US.news.txt         4     1      1     1    1   1      1   1     1
##   en_US.twitter.txt      1     1      2     2    1   1      1   1     2
##                    Terms
## Docs                set show shows state still support take talking thing
##   en_US.blogs.txt     1    3     1     1     3       1    1       1     3
##   en_US.news.txt      2    4     2     3     1       2    4       1     1
##   en_US.twitter.txt   1    4     1     1     1       1    1       1     1
##                    Terms
## Docs                think time trying want way week well white will work
##   en_US.blogs.txt       2    9      3    2   5    3    2     3   18    3
##   en_US.news.txt        4    7      2    3   1    1    1     1   12    1
##   en_US.twitter.txt     3    5      1    1   1    2    1     1    8    2
##                    Terms
## Docs                working
##   en_US.blogs.txt         3
##   en_US.news.txt          1
##   en_US.twitter.txt       1
findFreqTerms(dtm, 10)
##  [1] "also"   "can"    "even"   "first"  "get"    "going"  "good"  
##  [8] "got"    "just"   "know"   "last"   "like"   "love"   "make"  
## [15] "many"   "new"    "one"    "people" "said"   "show"   "time"  
## [22] "two"    "will"
findAssocs(dtm, "love", 0.9)
## $love
##         good       change        know.        tired       across 
##         1.00         0.98         0.98         0.98         0.94 
##      action.        allow       always         blog         bomb 
##         0.94         0.94         0.94         0.94         0.94 
##          boy       brings         cake         call   commercial 
##         0.94         0.94         0.94         0.94         0.94 
##     complete   considered      control         cool        death 
##         0.94         0.94         0.94         0.94         0.94 
##          due experiencing        front        heard      holiday 
##         0.94         0.94         0.94         0.94         0.94 
##         live         long        loved       lyrics      machine 
##         0.94         0.94         0.94         0.94         0.94 
##        maybe      minutes       months        much,        music 
##         0.94         0.94         0.94         0.94         0.94 
##        needs      nothing      person.     reaction      reading 
##         0.94         0.94         0.94         0.94         0.94 
##         save     sleeping         song       sounds       sunday 
##         0.94         0.94         0.94         0.94         0.94 
##       taking       today.    together,      towards         type 
##         0.94         0.94         0.94         0.94         0.94 
##      watched      wedding        weird        whole    wonderful 
##         0.94         0.94         0.94         0.94         0.94 
##         meet 
##         0.93
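Since the wordcloud package is already loaded, the term frequencies in the document-term matrix can also be summarized visually, for example with a word cloud (a minimal sketch based on the dtm built above):

# overall term frequencies, sorted from most to least frequent
freq <- sort(colSums(as.matrix(dtm)), decreasing = TRUE)
wordcloud(names(freq), freq, max.words = 50, colors = brewer.pal(8, "Dark2"))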

3. Report any interesting findings that you amassed so far

  1. The language has statistical properties; for example, some words (such as common adverbs) appear far more frequently than most others.
  2. These statistical properties vary across article types; for example, blogs tend to use simpler words than news articles (a small comparison sketch is shown below).
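To look at this more concretely, the most frequent terms of each document can be compared side by side (a small sketch using the dtm from section 2):

m <- as.matrix(dtm)
# top 5 terms in each of the three documents (blogs, news, twitter)
top5 <- lapply(seq_len(nrow(m)), function(i) head(sort(m[i, ], decreasing = TRUE), 5))
names(top5) <- rownames(m)
top5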

4. Get feedback on your plans for creating a prediction algorithm and Shiny app.

I will develop a Shiny app in which the n-gram probabilities differ depending on the topic or style the user chooses. A rough sketch of the n-gram counting step is shown below.
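Counting n-gram frequencies is the starting point for such a model; with the RWeka tokenizer already loaded, a bigram term-document matrix could be built along these lines (a minimal sketch; the control settings are illustrative, not final choices):

# tokenizer that splits text into bigrams
BigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))

# bigram counts per document; findFreqTerms lists the more common bigrams
bigram_tdm <- TermDocumentMatrix(ovid, control = list(tokenize = BigramTokenizer))
findFreqTerms(bigram_tdm, 3)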