library(ggplot2)
library(tm)
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
library(stringi)
# Fetch and extract the SwiftKey corpus. The whole step is skipped when the
# extracted data directory is already present in the working directory.
dataURL <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
dataDIR <- "final"
if (!dir.exists(dataDIR)) {
  dataZipName <- "Coursera-SwiftKey.zip"
  if (!file.exists(dataZipName)) {
    # Request binary mode explicitly so the archive is written byte-for-byte.
    download.file(dataURL, dataZipName, method = "auto", mode = "wb")
  }
  unzip(dataZipName)
  # Fail here with a clear message instead of later, when the corpus files
  # cannot be opened.
  if (!dir.exists(dataDIR)) {
    stop(
      "Extracting '", dataZipName, "' did not create the '", dataDIR,
      "' directory",
      call. = FALSE
    )
  }
  # The archive is only removed once the extracted directory exists.
  file.remove(dataZipName)
}
## [1] TRUE
# Paths of the three en_US source files inside the extracted data directory.
file.blog <- "final/en_US/en_US.blogs.txt"
file.twitter <- "final/en_US/en_US.twitter.txt"
file.web <- "final/en_US/en_US.news.txt"

# Read all lines of one corpus file.
#
# path: path of the text file to read.
# Returns a character vector with one element per line, marked as UTF-8.
#
# The connection is opened in binary mode and closed on exit, so no connection
# object is left behind. skipNul = TRUE drops embedded NUL bytes; an earlier
# text-mode read warned about them on lines 167155, 268547, 1274086 and
# 1759032 of the twitter file.
# NOTE(review): the earlier text-mode read of the news file also warned about
# an incomplete final line, which suggests it stopped early - confirm the line
# count of lines.web after this change.
readCorpusLines <- function(path) {
  con <- file(path, open = "rb")
  on.exit(close(con), add = TRUE)
  readLines(con, encoding = "UTF-8", skipNul = TRUE)
}

lines.blog <- readCorpusLines(file.blog)
lines.twitter <- readCorpusLines(file.twitter)
lines.web <- readCorpusLines(file.web)
# Words per line for every source; computed up front because the counts do
# not depend on the line totals printed next.
words.blog <- stri_count_words(lines.blog)
words.twitter <- stri_count_words(lines.twitter)
words.web <- stri_count_words(lines.web)

# Number of lines read from each source.
print(length(lines.blog))
## [1] 899288
print(length(lines.twitter))
## [1] 2360148
print(length(lines.web))
## [1] 77259

# Distribution of words per line for each source.
print(summary(words.blog))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 9.00 29.00 42.29 60.00 6725.00
print(summary(words.twitter))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 7.00 12.00 12.79 18.00 47.00
print(summary(words.web))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 19.00 32.00 34.81 46.00 1123.00
# Fraction of each source kept for the exploratory sample.
sample.fraction <- 0.1

# Draw the sample for each source from that source's own lines. Previously the
# twitter and web samples were both drawn from lines.blog, so every facet of
# the plot below actually showed blog text.
# NOTE(review): sampling is unseeded, so results differ between runs; consider
# calling set.seed() earlier in the script if reproducibility matters.
slines.blog <- sample(
  lines.blog,
  floor(sample.fraction * length(lines.blog))
)
slines.twitter <- sample(
  lines.twitter,
  floor(sample.fraction * length(lines.twitter))
)
slines.web <- sample(
  lines.web,
  floor(sample.fraction * length(lines.web))
)

# Words per sampled line.
swords.blog <- stri_count_words(slines.blog)
swords.twitter <- stri_count_words(slines.twitter)
swords.web <- stri_count_words(slines.web)

# Long-format table: `word` holds the word count of one sampled line and
# `type` names the source it came from. Label counts are taken from the same
# vectors that fill `word`, so the two columns always line up.
df.words.all <- data.frame(
  word = c(swords.blog, swords.twitter, swords.web),
  type = rep(
    c("blog", "twitter", "web"),
    times = c(length(swords.blog), length(swords.twitter), length(swords.web))
  )
)

# One density panel per source. xlim(0, 500) discards lines longer than 500
# words, which ggplot2 reports as a "Removed ... rows containing non-finite
# values" warning.
ggplot(data = df.words.all) +
  geom_density(aes(word)) +
  facet_wrap(~type, nrow = 3) +
  xlim(0, 500)
## 3. Has the data scientist made basic plots, such as histograms, to illustrate features of the data?
# Build a corpus from the sampled web lines and normalise it: lower-case,
# strip punctuation, strip digits.
webCorpus <- Corpus(VectorSource(slines.web))
webCorpus <- tm_map(webCorpus, content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(webCorpus, content_transformer(tolower)):
## transformation drops documents
webCorpus <- tm_map(webCorpus, removePunctuation)
## Warning in tm_map.SimpleCorpus(webCorpus, removePunctuation):
## transformation drops documents
webCorpus <- tm_map(webCorpus, removeNumbers)
## Warning in tm_map.SimpleCorpus(webCorpus, removeNumbers): transformation
## drops documents

# Term-document matrix that keeps terms of every length. The option that sets
# the length bounds is `wordLengths`; the `minWordLength` name used before is
# not recognised, so tm's default minimum of three characters applied. The ten
# most frequent terms recorded from that run were the, and, that, for, you,
# with, was, this, have, but - none shorter than three letters.
webDTM <- TermDocumentMatrix(
  webCorpus,
  control = list(wordLengths = c(1, Inf))
)

# Total occurrences of each term across all documents, most frequent first.
# as.matrix() builds the dense matrix in memory, which is acceptable for the
# sample but grows with terms x documents.
mWeb <- as.matrix(webDTM)
webOrder <- sort(rowSums(mWeb), decreasing = TRUE)
head(webOrder, 10)
# The least frequent terms recorded earlier included mis-decoded tokens such
# as "swankingвђќ".
# NOTE(review): this looks like typographic quotes from a mis-decoded read
# surviving removePunctuation - confirm the encoding of the input lines.
tail(webOrder, 10)