library(tm)
## Loading required package: NLP
library(wordcloud)
## Loading required package: RColorBrewer
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
library(Hmisc)
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
##
## format.pval, round.POSIXt, trunc.POSIXt, units
library(openNLP)
library(RWeka)
library(Rstem)
library(SnowballC)
##
## Attaching package: 'SnowballC'
## The following objects are masked from 'package:Rstem':
##
## getStemLanguages, wordStem
library(qdap)
## Loading required package: qdapDictionaries
## Loading required package: qdapRegex
##
## Attaching package: 'qdapRegex'
## The following object is masked from 'package:ggplot2':
##
## %+%
## Loading required package: qdapTools
##
## Attaching package: 'qdap'
## The following objects are masked from 'package:tm':
##
## as.DocumentTermMatrix, as.TermDocumentMatrix
## The following object is masked from 'package:NLP':
##
## ngrams
## The following object is masked from 'package:base':
##
## Filter
First, I will shorten the corpus: the full corpus is too big and makes my computer very slow. Second, I will load the shortened corpus with the tm package and check the corpus metadata.
setwd('~/R/NLP/Cousera-Swiftkey/en_US/')
blog <- readLines("~/R/NLP/Cousera-Swiftkey/en_US/en_US.blogs.txt",n=100)
news <- readLines("~/R/NLP/Cousera-Swiftkey/en_US/en_US.news.txt",n=100)
twitter <- readLines("~/R/NLP/Cousera-Swiftkey/en_US/en_US.twitter.txt",n=100)
writeLines(blog,'~/R/NLP/Cousera-Swiftkey/shorts/en_US.blogs.txt')
writeLines(news,'~/R/NLP/Cousera-Swiftkey/shorts/en_US.news.txt')
writeLines(twitter,'~/R/NLP/Cousera-Swiftkey/shorts/en_US.twitter.txt')
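Taking the first 100 lines of each file is quick but not random. A minimal sketch of a random-sampling alternative (the sample_lines helper and the 1% rate are illustrative choices, not used for the results below):
set.seed(123)
# draw roughly 1% of a file's lines at random (illustrative only)
sample_lines <- function(path, frac = 0.01) {
  lines <- readLines(path, skipNul = TRUE)
  sample(lines, size = ceiling(frac * length(lines)))
}
blog_sample <- sample_lines("~/R/NLP/Cousera-Swiftkey/en_US/en_US.blogs.txt")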
(ovid <- Corpus(DirSource('~/R/NLP/Cousera-Swiftkey/shorts'),
                readerControl = list(reader = readPlain,
                                     language = "en_US",
                                     load = TRUE)))
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 3
inspect(ovid)
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 3
##
## [[1]]
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 25744
##
## [[2]]
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 18920
##
## [[3]]
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 6755
summary(ovid)
## Length Class Mode
## en_US.blogs.txt 2 PlainTextDocument list
## en_US.news.txt 2 PlainTextDocument list
## en_US.twitter.txt 2 PlainTextDocument list
meta(ovid[[1]])
## author : character(0)
## datetimestamp: 2017-02-19 09:55:42
## description : character(0)
## heading : character(0)
## id : en_US.blogs.txt
## language : en_US
## origin : character(0)
meta(ovid[[2]])
## author : character(0)
## datetimestamp: 2017-02-19 09:55:42
## description : character(0)
## heading : character(0)
## id : en_US.news.txt
## language : en_US
## origin : character(0)
meta(ovid[[3]])
## author : character(0)
## datetimestamp: 2017-02-19 09:55:42
## description : character(0)
## heading : character(0)
## id : en_US.twitter.txt
## language : en_US
## origin : character(0)
ovid <- tm_map(ovid, stripWhitespace)
ovid <- tm_map(ovid, content_transformer(tolower))
ovid <- tm_map(ovid, removeWords, stopwords("english"))
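Further cleaning is possible; here is a minimal sketch of extra steps (punctuation and number removal plus stemming via the already-loaded SnowballC), kept in a separate object so the results below are unchanged:
# optional extra cleaning (illustrative; not applied to the analysis below)
ovid_clean <- tm_map(ovid, removePunctuation)
ovid_clean <- tm_map(ovid_clean, removeNumbers)
ovid_clean <- tm_map(ovid_clean, stemDocument)  # stemming uses SnowballC's wordStem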
Next, I will check word frequencies with the tm package to get a feel for the data, and plot correlations between the most frequent terms.
dtm <- DocumentTermMatrix(ovid)
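# the term-correlation plot below needs the Rgraphviz package (Bioconductor)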
plot(dtm, terms = findFreqTerms(dtm, lowfreq = 8)[1:10], corThreshold = 0.5)
inspect(removeSparseTerms(dtm, 0.1))
## <<DocumentTermMatrix (documents: 3, terms: 87)>>
## Non-/sparse entries: 261/0
## Sparsity : 0%
## Maximal term length: 9
## Weighting : term frequency (tf)
##
## Terms
## Docs according also always another around back better big
## en_US.blogs.txt 1 13 3 3 3 2 2 2
## en_US.news.txt 1 9 2 5 2 3 1 2
## en_US.twitter.txt 1 1 3 1 1 1 1 1
## Terms
## Docs birthday can care check come company day decided die
## en_US.blogs.txt 1 8 1 2 6 2 5 3 1
## en_US.news.txt 1 4 1 2 2 1 1 2 1
## en_US.twitter.txt 1 4 1 1 1 1 2 1 1
## Terms
## Docs done eat even every feel find first free fun game get
## en_US.blogs.txt 1 2 7 4 2 3 5 2 3 1 12
## en_US.news.txt 1 1 2 2 2 1 10 4 1 2 6
## en_US.twitter.txt 1 1 1 3 1 1 4 1 1 1 3
## Terms
## Docs going good got green help just know last life like
## en_US.blogs.txt 5 5 4 1 2 9 9 4 1 21
## en_US.news.txt 3 2 5 2 1 5 5 5 2 4
## en_US.twitter.txt 3 6 3 1 1 7 3 2 1 3
## Terms
## Docs list little long love make makes many might move must
## en_US.blogs.txt 1 5 2 4 6 2 7 1 1 3
## en_US.news.txt 1 1 1 2 7 1 1 4 1 3
## en_US.twitter.txt 1 1 2 5 2 1 2 1 1 1
## Terms
## Docs need never new next nice now now. one open others
## en_US.blogs.txt 5 5 7 3 7 4 1 11 1 1
## en_US.news.txt 1 1 13 3 1 3 1 6 1 1
## en_US.twitter.txt 1 2 3 2 1 1 1 2 1 1
## Terms
## Docs people ready really right room saw saying see seems
## en_US.blogs.txt 11 1 5 5 1 1 1 3 3
## en_US.news.txt 4 1 1 1 1 1 1 1 1
## en_US.twitter.txt 1 1 2 2 1 1 1 1 2
## Terms
## Docs set show shows state still support take talking thing
## en_US.blogs.txt 1 3 1 1 3 1 1 1 3
## en_US.news.txt 2 4 2 3 1 2 4 1 1
## en_US.twitter.txt 1 4 1 1 1 1 1 1 1
## Terms
## Docs think time trying want way week well white will work
## en_US.blogs.txt 2 9 3 2 5 3 2 3 18 3
## en_US.news.txt 4 7 2 3 1 1 1 1 12 1
## en_US.twitter.txt 3 5 1 1 1 2 1 1 8 2
## Terms
## Docs working
## en_US.blogs.txt 3
## en_US.news.txt 1
## en_US.twitter.txt 1
findFreqTerms(dtm, 10)
## [1] "also" "can" "even" "first" "get" "going" "good"
## [8] "got" "just" "know" "last" "like" "love" "make"
## [15] "many" "new" "one" "people" "said" "show" "time"
## [22] "two" "will"
findAssocs(dtm, "love", 0.9)
## $love
## good change know. tired across
## 1.00 0.98 0.98 0.98 0.94
## action. allow always blog bomb
## 0.94 0.94 0.94 0.94 0.94
## boy brings cake call commercial
## 0.94 0.94 0.94 0.94 0.94
## complete considered control cool death
## 0.94 0.94 0.94 0.94 0.94
## due experiencing front heard holiday
## 0.94 0.94 0.94 0.94 0.94
## live long loved lyrics machine
## 0.94 0.94 0.94 0.94 0.94
## maybe minutes months much, music
## 0.94 0.94 0.94 0.94 0.94
## needs nothing person. reaction reading
## 0.94 0.94 0.94 0.94 0.94
## save sleeping song sounds sunday
## 0.94 0.94 0.94 0.94 0.94
## taking today. together, towards type
## 0.94 0.94 0.94 0.94 0.94
## watched wedding weird whole wonderful
## 0.94 0.94 0.94 0.94 0.94
## meet
## 0.93
Finally, I will develop a Shiny app in which the n-gram probabilities change depending on the topic or style the user chooses.
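As a first step toward those n-gram probabilities, here is a minimal sketch of bigram counting with the already-loaded RWeka (the frequency cut-off of 5 is arbitrary, and the prediction model itself is still to be built):
# tokenize into bigrams and count them across the three documents (illustrative)
BigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
tdm2 <- TermDocumentMatrix(ovid, control = list(tokenize = BigramTokenizer))
findFreqTerms(tdm2, 5)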