blogs<-readLines("en_US.blogs.txt",warn=FALSE,encoding="UTF-8")
## Warning in file(con, "r"): cannot open file 'en_US.blogs.txt': No such file or
## directory
## Error in file(con, "r"): cannot open the connection
news<-readLines("en_US.news.txt",warn=FALSE,encoding="UTF-8")
## Warning in file(con, "r"): cannot open file 'en_US.news.txt': No such file or
## directory
## Error in file(con, "r"): cannot open the connection
twitter<-readLines("en_US.twitter.txt",warn=FALSE,encoding="UTF-8")
## Warning in file(con, "r"): cannot open file 'en_US.twitter.txt': No such file or
## directory
## Error in file(con, "r"): cannot open the connection
I set the working directory and load the three data sets (blogs, news, and Twitter).
# On-disk size of each raw corpus file, converted from bytes to MiB
# (1 MiB = 2^20 bytes).
size_blogs <- file.size(
  "D:/1-1. R studio/Lecture10. Data science capstone/week2/final/en_US/en_US.blogs.txt"
) / 2^20
size_news <- file.size(
  "D:/1-1. R studio/Lecture10. Data science capstone/week2/final/en_US/en_US.news.txt"
) / 2^20
size_twitter <- file.size(
  "D:/1-1. R studio/Lecture10. Data science capstone/week2/final/en_US/en_US.twitter.txt"
) / 2^20
len_blogs<-length(blogs)
## Error in eval(expr, envir, enclos): object 'blogs' not found
len_news<-length(news)
len_twitter<-length(twitter)
## Error in eval(expr, envir, enclos): object 'twitter' not found
nchar_blogs<-sum(nchar(blogs))
## Error in nchar(blogs): object 'blogs' not found
nchar_news<-sum(nchar(news))
## Error in nchar(news): cannot coerce type 'closure' to vector of type 'character'
nchar_twitter<-sum(nchar(twitter))
## Error in nchar(twitter): object 'twitter' not found
library(stringi)
## Warning: package 'stringi' was built under R version 4.0.2
nword_blogs<-stri_stats_latex(blogs)[4]
## Error in stri_stats_latex(blogs): object 'blogs' not found
nword_news<-stri_stats_latex(news)[4]
## Error in stri_stats_latex(news): argument `str` should be a character vector (or an object coercible to)
nword_twitter<-stri_stats_latex(twitter)[4]
## Error in stri_stats_latex(twitter): object 'twitter' not found
# Per-file summary: one row per source (blogs, news, Twitter), combining
# the file size, row count, character count and word count computed above.
table <- data.frame(
  "File Name" = c("Blogs", "News", "Twitter"),
  "File Size(MB)" = c(size_blogs, size_news, size_twitter),
  "Num of rows" = c(len_blogs, len_news, len_twitter),
  "Num of character" = c(nchar_blogs, nchar_news, nchar_twitter),
  "Num of words" = c(nword_blogs, nword_news, nword_twitter)
)
## Error in data.frame(`File Name` = c("Blogs", "News", "Twitter"), `File Size(MB)` = c(size_blogs, : object 'len_blogs' not found
table
## function (..., exclude = if (useNA == "no") c(NA, NaN), useNA = c("no",
## "ifany", "always"), dnn = list.names(...), deparse.level = 1)
## {
## list.names <- function(...) {
## l <- as.list(substitute(list(...)))[-1L]
## nm <- names(l)
## fixup <- if (is.null(nm))
## seq_along(l)
## else nm == ""
## dep <- vapply(l[fixup], function(x) switch(deparse.level +
## 1, "", if (is.symbol(x)) as.character(x) else "",
## deparse(x, nlines = 1)[1L]), "")
## if (is.null(nm))
## dep
## else {
## nm[fixup] <- dep
## nm
## }
## }
## miss.use <- missing(useNA)
## miss.exc <- missing(exclude)
## useNA <- if (miss.use && !miss.exc && !match(NA, exclude,
## nomatch = 0L))
## "ifany"
## else match.arg(useNA)
## doNA <- useNA != "no"
## if (!miss.use && !miss.exc && doNA && match(NA, exclude,
## nomatch = 0L))
## warning("'exclude' containing NA and 'useNA' != \"no\"' are a bit contradicting")
## args <- list(...)
## if (!length(args))
## stop("nothing to tabulate")
## if (length(args) == 1L && is.list(args[[1L]])) {
## args <- args[[1L]]
## if (length(dnn) != length(args))
## dnn <- if (!is.null(argn <- names(args)))
## argn
## else paste(dnn[1L], seq_along(args), sep = ".")
## }
## bin <- 0L
## lens <- NULL
## dims <- integer()
## pd <- 1L
## dn <- NULL
## for (a in args) {
## if (is.null(lens))
## lens <- length(a)
## else if (length(a) != lens)
## stop("all arguments must have the same length")
## fact.a <- is.factor(a)
## if (doNA)
## aNA <- anyNA(a)
## if (!fact.a) {
## a0 <- a
## a <- factor(a, exclude = exclude)
## }
## add.na <- doNA
## if (add.na) {
## ifany <- (useNA == "ifany")
## anNAc <- anyNA(a)
## add.na <- if (!ifany || anNAc) {
## ll <- levels(a)
## if (add.ll <- !anyNA(ll)) {
## ll <- c(ll, NA)
## TRUE
## }
## else if (!ifany && !anNAc)
## FALSE
## else TRUE
## }
## else FALSE
## }
## if (add.na)
## a <- factor(a, levels = ll, exclude = NULL)
## else ll <- levels(a)
## a <- as.integer(a)
## if (fact.a && !miss.exc) {
## ll <- ll[keep <- which(match(ll, exclude, nomatch = 0L) ==
## 0L)]
## a <- match(a, keep)
## }
## else if (!fact.a && add.na) {
## if (ifany && !aNA && add.ll) {
## ll <- ll[!is.na(ll)]
## is.na(a) <- match(a0, c(exclude, NA), nomatch = 0L) >
## 0L
## }
## else {
## is.na(a) <- match(a0, exclude, nomatch = 0L) >
## 0L
## }
## }
## nl <- length(ll)
## dims <- c(dims, nl)
## if (prod(dims) > .Machine$integer.max)
## stop("attempt to make a table with >= 2^31 elements")
## dn <- c(dn, list(ll))
## bin <- bin + pd * (a - 1L)
## pd <- pd * nl
## }
## names(dn) <- dnn
## bin <- bin[!is.na(bin)]
## if (length(bin))
## bin <- bin + 1L
## y <- array(tabulate(bin, pd), dims, dimnames = dn)
## class(y) <- "table"
## y
## }
## <bytecode: 0x0000000015b34298>
## <environment: namespace:base>
I summarize each file by its size, number of rows, number of characters, and number of words, and collect the results in a table.
set.seed(12345)
blogs1<-iconv(blogs,"latin1","ASCII",sub="")
## Error in iconv(blogs, "latin1", "ASCII", sub = ""): object 'blogs' not found
news1<-iconv(news,"latin1","ASCII",sub="")
## Error in as.character(x): cannot coerce type 'closure' to vector of type 'character'
twitter1<-iconv(twitter,"latin1","ASCII",sub="")
## Error in iconv(twitter, "latin1", "ASCII", sub = ""): object 'twitter' not found
rm(blogs)
## Warning in rm(blogs): object 'blogs' not found
rm(news)
## Warning in rm(news): object 'news' not found
rm(twitter)
## Warning in rm(twitter): object 'twitter' not found
# Draw a random 1% of the lines from each ASCII-converted source and pool
# the three samples into one vector (blogs first, then news, then Twitter).
sample_data <- c(
  sample(blogs1, length(blogs1) * 0.01),
  sample(news1, length(news1) * 0.01),
  sample(twitter1, length(twitter1) * 0.01)
)
## Error in sample(blogs1, length(blogs1) * 0.01): object 'blogs1' not found
rm(blogs1)
## Warning in rm(blogs1): object 'blogs1' not found
rm(news1)
## Warning in rm(news1): object 'news1' not found
rm(twitter1)
## Warning in rm(twitter1): object 'twitter1' not found
The data sets are very large, so I use the sample() function to draw a 1% sample from each file.
library(tm)
## Warning: package 'tm' was built under R version 4.0.2
## Loading required package: NLP
library(NLP)
corpus<-VCorpus(VectorSource(sample_data))
## Error in SimpleSource(length = length(x), content = x, class = "VectorSource"): object 'sample_data' not found
corpus1<-tm_map(corpus,removePunctuation)
## Error in tm_map(corpus, removePunctuation): object 'corpus' not found
corpus2<-tm_map(corpus1,stripWhitespace)
## Error in tm_map(corpus1, stripWhitespace): object 'corpus1' not found
# Lower-case every document. tolower() is a base function rather than a tm
# transformation, so it is wrapped in content_transformer(); this keeps each
# element a text document instead of replacing it with a bare character
# vector.
corpus3 <- tm_map(corpus2, content_transformer(tolower))
## Error in tm_map(corpus2, tolower): object 'corpus2' not found
corpus4<-tm_map(corpus3,removeNumbers)
## Error in tm_map(corpus3, removeNumbers): object 'corpus3' not found
corpus5<-tm_map(corpus4,PlainTextDocument)
## Error in tm_map(corpus4, PlainTextDocument): object 'corpus4' not found
corpus6<-tm_map(corpus5,removeWords,stopwords("english"))
## Error in tm_map(corpus5, removeWords, stopwords("english")): object 'corpus5' not found
corpus_result<-data.frame(text=unlist(sapply(corpus6,'[',"content")),stringsAsFactors = FALSE)
## Error in lapply(X = X, FUN = FUN, ...): object 'corpus6' not found
head(corpus_result)
## Error in head(corpus_result): object 'corpus_result' not found
rm(corpus)
## Warning in rm(corpus): object 'corpus' not found
rm(corpus1)
## Warning in rm(corpus1): object 'corpus1' not found
rm(corpus2)
## Warning in rm(corpus2): object 'corpus2' not found
rm(corpus3)
## Warning in rm(corpus3): object 'corpus3' not found
rm(corpus4)
## Warning in rm(corpus4): object 'corpus4' not found
rm(corpus5)
## Warning in rm(corpus5): object 'corpus5' not found
I build the corpus and check it by converting it to a data frame.
library(RWeka)
## Warning: package 'RWeka' was built under R version 4.0.2
## Error: package or namespace load failed for 'RWeka':
## .onLoad failed in loadNamespace() for 'rJava', details:
## call: fun(libname, pkgname)
## error: JAVA_HOME cannot be determined from the Registry
# N-gram tokenizers passed to TermDocumentMatrix() below. Each one splits
# the text `x` into n-grams of a single fixed size: 1 (unigrams),
# 2 (bigrams) or 3 (trigrams).
one <- function(x) {
  NGramTokenizer(x, Weka_control(min = 1, max = 1))
}
two <- function(x) {
  NGramTokenizer(x, Weka_control(min = 2, max = 2))
}
thr <- function(x) {
  NGramTokenizer(x, Weka_control(min = 3, max = 3))
}
one_table<-TermDocumentMatrix(corpus6,control=list(tokenize=one))
## Error in TermDocumentMatrix(corpus6, control = list(tokenize = one)): object 'corpus6' not found
two_table<-TermDocumentMatrix(corpus6,control=list(tokenize=two))
## Error in TermDocumentMatrix(corpus6, control = list(tokenize = two)): object 'corpus6' not found
thr_table<-TermDocumentMatrix(corpus6,control=list(tokenize=thr))
## Error in TermDocumentMatrix(corpus6, control = list(tokenize = thr)): object 'corpus6' not found
one_corpus<-findFreqTerms(one_table,lowfreq=1000)
## Error in stopifnot(inherits(x, c("DocumentTermMatrix", "TermDocumentMatrix")), : object 'one_table' not found
two_corpus<-findFreqTerms(two_table,lowfreq=80)
## Error in stopifnot(inherits(x, c("DocumentTermMatrix", "TermDocumentMatrix")), : object 'two_table' not found
thr_corpus<-findFreqTerms(thr_table,lowfreq=10)
## Error in stopifnot(inherits(x, c("DocumentTermMatrix", "TermDocumentMatrix")), : object 'thr_table' not found
one_corpus_num<-rowSums(as.matrix(one_table[one_corpus,]))
## Error in as.matrix(one_table[one_corpus, ]): object 'one_table' not found
one_corpus_table<-data.frame(Word=names(one_corpus_num),frequency=one_corpus_num)
## Error in data.frame(Word = names(one_corpus_num), frequency = one_corpus_num): object 'one_corpus_num' not found
one_corpus_sort<-one_corpus_table[order(-one_corpus_table$frequency),]
## Error in eval(expr, envir, enclos): object 'one_corpus_table' not found
head(one_corpus_sort)
## Error in head(one_corpus_sort): object 'one_corpus_sort' not found
two_corpus_num<-rowSums(as.matrix(two_table[two_corpus,]))
## Error in as.matrix(two_table[two_corpus, ]): object 'two_table' not found
two_corpus_table<-data.frame(Word=names(two_corpus_num),frequency=two_corpus_num)
## Error in data.frame(Word = names(two_corpus_num), frequency = two_corpus_num): object 'two_corpus_num' not found
two_corpus_sort<-two_corpus_table[order(-two_corpus_table$frequency),]
## Error in eval(expr, envir, enclos): object 'two_corpus_table' not found
head(two_corpus_sort)
## Error in head(two_corpus_sort): object 'two_corpus_sort' not found
thr_corpus_num<-rowSums(as.matrix(thr_table[thr_corpus,]))
## Error in as.matrix(thr_table[thr_corpus, ]): object 'thr_table' not found
thr_corpus_table<-data.frame(Word=names(thr_corpus_num),frequency=thr_corpus_num)
## Error in data.frame(Word = names(thr_corpus_num), frequency = thr_corpus_num): object 'thr_corpus_num' not found
thr_corpus_sort<-thr_corpus_table[order(-thr_corpus_table$frequency),]
## Error in eval(expr, envir, enclos): object 'thr_corpus_table' not found
head(thr_corpus_sort)
## Error in head(thr_corpus_sort): object 'thr_corpus_sort' not found
I extract the terms and their frequencies for each n-gram size (unigrams, bigrams, and trigrams).
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.0.2
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
one_g<-ggplot(one_corpus_sort[1:10,],aes(x=reorder(Word,-frequency),y=frequency,fill=frequency))
## Error in ggplot(one_corpus_sort[1:10, ], aes(x = reorder(Word, -frequency), : object 'one_corpus_sort' not found
one_g<-one_g+geom_bar(stat="identity")
## Error in eval(expr, envir, enclos): object 'one_g' not found
one_g<-one_g+labs(title="Unigrams",x="Words",y="Frequency")
## Error in eval(expr, envir, enclos): object 'one_g' not found
one_g<-one_g+theme(axis.text.x=element_text(angle=90))
## Error in eval(expr, envir, enclos): object 'one_g' not found
one_g
## Error in eval(expr, envir, enclos): object 'one_g' not found
two_g<-ggplot(two_corpus_sort[1:10,],aes(x=reorder(Word,-frequency),y=frequency,fill=frequency))
## Error in ggplot(two_corpus_sort[1:10, ], aes(x = reorder(Word, -frequency), : object 'two_corpus_sort' not found
two_g<-two_g+geom_bar(stat="identity")
## Error in eval(expr, envir, enclos): object 'two_g' not found
two_g<-two_g+labs(title="Bigrams",x="Words",y="Frequency")
## Error in eval(expr, envir, enclos): object 'two_g' not found
two_g<-two_g+theme(axis.text.x=element_text(angle=90))
## Error in eval(expr, envir, enclos): object 'two_g' not found
two_g
## Error in eval(expr, envir, enclos): object 'two_g' not found
thr_g<-ggplot(thr_corpus_sort[1:10,],aes(x=reorder(Word,-frequency),y=frequency,fill=frequency))
## Error in ggplot(thr_corpus_sort[1:10, ], aes(x = reorder(Word, -frequency), : object 'thr_corpus_sort' not found
thr_g<-thr_g+geom_bar(stat="identity")
## Error in eval(expr, envir, enclos): object 'thr_g' not found
thr_g<-thr_g+labs(title="Trigrams",x="Words",y="Frequency")
## Error in eval(expr, envir, enclos): object 'thr_g' not found
thr_g<-thr_g+theme(axis.text.x=element_text(angle=90))
## Error in eval(expr, envir, enclos): object 'thr_g' not found
thr_g
## Error in eval(expr, envir, enclos): object 'thr_g' not found
I plot the ten most frequent terms for each n-gram size. These plots show which words and phrases occur most frequently in the sampled text.
This completes the initial exploratory analysis. Next, I will build a predictive algorithm and use a Shiny app to check the predictions it produces for a given input.