Load data

# Directory that holds the three en_US corpus files. It is the same location
# that the summary section passes to file.size(), so loading the data no longer
# depends on the current working directory.
data_dir <- "D:/1-1. R studio/Lecture10. Data science capstone/week2/final/en_US"

# Read one corpus file into a character vector, one element per line.
#
# file_name: name of a file inside data_dir.
# Returns:   character vector marked as UTF-8.
# Stops with a clear message when the file is missing, instead of leaving the
# object undefined and letting every later step fail.
read_corpus <- function(file_name) {
  path <- file.path(data_dir, file_name)
  if (!file.exists(path)) {
    stop("Corpus file not found: ", path, call. = FALSE)
  }
  # Binary mode avoids platform text-mode translation while reading (on
  # Windows, text mode may stop early at a Ctrl-Z byte -- worth confirming on
  # the news file). skipNul = TRUE skips embedded NUL bytes rather than
  # letting them terminate the line being read.
  con <- file(path, open = "rb")
  on.exit(close(con), add = TRUE)
  readLines(con, warn = FALSE, encoding = "UTF-8", skipNul = TRUE)
}

blogs <- read_corpus("en_US.blogs.txt")
news <- read_corpus("en_US.news.txt")
twitter <- read_corpus("en_US.twitter.txt")

I set the data directory and load the three data files (blogs, news, and Twitter).

Summarize data

library(stringi)

# Location of the raw corpus files; defined once instead of being repeated in
# every file.size() call.
data_dir <- "D:/1-1. R studio/Lecture10. Data science capstone/week2/final/en_US"

# File size on disk, converted from bytes to MB (2^20 bytes).
size_blogs <- file.size(file.path(data_dir, "en_US.blogs.txt")) / 2^20
size_news <- file.size(file.path(data_dir, "en_US.news.txt")) / 2^20
size_twitter <- file.size(file.path(data_dir, "en_US.twitter.txt")) / 2^20

# Number of lines read from each file.
len_blogs <- length(blogs)
len_news <- length(news)
len_twitter <- length(twitter)

# Total number of characters over all lines.
nchar_blogs <- sum(nchar(blogs))
nchar_news <- sum(nchar(news))
nchar_twitter <- sum(nchar(twitter))

# Word counts. The "Words" entry is selected by name rather than by position,
# so the code does not rely on the ordering of the statistics vector.
nword_blogs <- stri_stats_latex(blogs)[["Words"]]
nword_news <- stri_stats_latex(news)[["Words"]]
nword_twitter <- stri_stats_latex(twitter)[["Words"]]

# The result is stored as `summary_table`, not `table`, so that base::table is
# not shadowed. (When the data frame could not be built, printing `table`
# printed the source of the base function instead of the summary.)
# check.names = FALSE keeps the column headers exactly as written below.
summary_table <- data.frame(
  "File Name" = c("Blogs", "News", "Twitter"),
  "File Size(MB)" = c(size_blogs, size_news, size_twitter),
  "Num of rows" = c(len_blogs, len_news, len_twitter),
  "Num of character" = c(nchar_blogs, nchar_news, nchar_twitter),
  "Num of words" = c(nword_blogs, nword_news, nword_twitter),
  check.names = FALSE
)
summary_table

Summarize each file by its size, number of rows, number of characters, and number of words, and collect the results in a table.

Clean data

# Fix the random seed so that the sample drawn below is reproducible.
set.seed(12345)

# Convert each data set to ASCII; sub = "" deletes every byte that cannot be
# converted.
blogs1 <- iconv(blogs, from = "latin1", to = "ASCII", sub = "")
news1 <- iconv(news, from = "latin1", to = "ASCII", sub = "")
twitter1 <- iconv(twitter, from = "latin1", to = "ASCII", sub = "")

# The raw vectors are no longer needed once the ASCII copies exist.
rm(blogs, news, twitter)

# Keep only 1% of the lines of each file, drawn in the order
# blogs -> news -> twitter.
sample_data <- c(
  sample(blogs1, size = length(blogs1) * 0.01),
  sample(news1, size = length(news1) * 0.01),
  sample(twitter1, size = length(twitter1) * 0.01)
)

# Drop the full ASCII copies; only the combined sample is used from here on.
rm(blogs1, news1, twitter1)

The data sets are very large, so I use the sample() function to draw a 1% sample from each file.

Build corpus

library(tm)
library(NLP)

# Build a volatile corpus with one document per sampled line.
corpus <- VCorpus(VectorSource(sample_data))

# tolower() is a plain character function, so it is wrapped in
# content_transformer(); this keeps every element a text document and makes a
# separate PlainTextDocument repair step unnecessary.
corpus <- tm_map(corpus, content_transformer(tolower))

# Stop words are removed *before* punctuation: the English stop-word list
# contains contractions such as "don't", which would no longer match once the
# apostrophe has been stripped.
corpus <- tm_map(corpus, removeWords, stopwords("english"))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)

# Whitespace is collapsed last because the removals above leave gaps behind.
corpus6 <- tm_map(corpus, stripWhitespace)
rm(corpus)

# Flatten the cleaned documents into a data frame for a quick visual check.
# vapply() guarantees exactly one string per document.
corpus_result <- data.frame(
  text = vapply(
    corpus6,
    function(doc) paste(content(doc), collapse = " "),
    character(1),
    USE.NAMES = FALSE
  ),
  stringsAsFactors = FALSE
)
head(corpus_result)

Build the corpus, then check the result by converting it to a data frame.

Build N-gram

# N-gram tokenizers built on NLP::ngrams(). RWeka's NGramTokenizer needs a
# working Java installation (rJava), and loading RWeka failed on this machine;
# these tokenizers use only packages that are already loaded.
#
# n:       number of words per term.
# Returns: a function that takes one text document and returns a character
#          vector with its n-grams, words separated by a single space.
make_ngram_tokenizer <- function(n) {
  force(n)
  function(x) {
    unlist(
      lapply(NLP::ngrams(NLP::words(x), n), paste, collapse = " "),
      use.names = FALSE
    )
  }
}

one <- make_ngram_tokenizer(1L)
two <- make_ngram_tokenizer(2L)
thr <- make_ngram_tokenizer(3L)

one_table <- TermDocumentMatrix(corpus6, control = list(tokenize = one))
two_table <- TermDocumentMatrix(corpus6, control = list(tokenize = two))
thr_table <- TermDocumentMatrix(corpus6, control = list(tokenize = thr))

# Frequency table of the terms that occur at least `lowfreq` times.
#
# tdm:     a TermDocumentMatrix.
# lowfreq: minimum total count for a term to be kept.
# Returns: data frame with columns Word and frequency, sorted by decreasing
#          frequency.
ngram_frequency <- function(tdm, lowfreq) {
  frequent_terms <- findFreqTerms(tdm, lowfreq = lowfreq)
  # Only the rows of the frequent terms are converted to a dense matrix.
  counts <- rowSums(as.matrix(tdm[frequent_terms, ]))
  freq_table <- data.frame(
    Word = as.character(names(counts)),
    frequency = counts
  )
  freq_table[order(freq_table$frequency, decreasing = TRUE), ]
}

one_corpus_sort <- ngram_frequency(one_table, lowfreq = 1000)
head(one_corpus_sort)

two_corpus_sort <- ngram_frequency(two_table, lowfreq = 80)
head(two_corpus_sort)

thr_corpus_sort <- ngram_frequency(thr_table, lowfreq = 10)
head(thr_corpus_sort)

Extract the words and frequencies of the N-grams.

Plot graph

library(ggplot2)

# Bar chart of the most frequent terms in an n-gram frequency table.
#
# freq_table: data frame with columns Word and frequency, already sorted by
#             decreasing frequency.
# title:      plot title.
# n:          maximum number of terms to show.
# Returns:    a ggplot object.
plot_top_ngrams <- function(freq_table, title, n = 10L) {
  # head() returns at most n rows; indexing with 1:n would add NA rows when
  # fewer than n terms passed the frequency threshold.
  top_terms <- head(freq_table, n)
  ggplot(
    top_terms,
    aes(x = reorder(Word, -frequency), y = frequency, fill = frequency)
  ) +
    geom_bar(stat = "identity") +
    labs(title = title, x = "Words", y = "Frequency") +
    theme(axis.text.x = element_text(angle = 90))
}

one_g <- plot_top_ngrams(one_corpus_sort, "Unigrams")
one_g

two_g <- plot_top_ngrams(two_corpus_sort, "Bigrams")
two_g

thr_g <- plot_top_ngrams(thr_corpus_sort, "Trigrams")
thr_g

Plot the most frequent terms for each N-gram size. The graphs show which words and phrases occur most frequently in these files.

Next plans

This completes the initial exploratory analysis. Next, I will build a predictive algorithm and use a Shiny app to check the prediction it returns for a given input.