setwd("D:/Project/Coursera-L10-Data-science-capstone-week2-master/final/en_US/")
blogs<-readLines("en_US.blogs.txt",warn=FALSE,encoding="UTF-8")
news<-readLines("en_US.news.txt",warn=FALSE,encoding="UTF-8")
twitter<-readLines("en_US.twitter.txt",warn=FALSE,encoding="UTF-8")
I set the working directory and load the three data files.
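One caveat: readLines() is often reported to stop early on en_US.news.txt because the file contains embedded nul characters. If the news line count looks too low, re-reading with skipNul = TRUE (a standard readLines() argument) is a simple fix; the line below is that workaround, not part of the original code.

news<-readLines("en_US.news.txt",warn=FALSE,encoding="UTF-8",skipNul=TRUE)  # skip embedded nuls that can truncate the read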
# file size in MB, using the file names in the working directory set above
size_blogs<-file.size("en_US.blogs.txt")/2^20
size_news<-file.size("en_US.news.txt")/2^20
size_twitter<-file.size("en_US.twitter.txt")/2^20
len_blogs<-length(blogs)
len_news<-length(news)
len_twitter<-length(twitter)
nchar_blogs<-sum(nchar(blogs))
nchar_news<-sum(nchar(news))
nchar_twitter<-sum(nchar(twitter))
library(stringi)
nword_blogs<-stri_stats_latex(blogs)[4]
nword_news<-stri_stats_latex(news)[4]
nword_twitter<-stri_stats_latex(twitter)[4]
table<-data.frame("File Name"=c("Blogs","News","Twitter"),
"File Size(MB)"=c(size_blogs,size_news,size_twitter),
"Num of rows"=c(len_blogs,len_news,len_twitter),
"Num of character"=c(nchar_blogs,nchar_news,nchar_twitter),
"Num of words"=c(nword_blogs,nword_news,nword_twitter))
table
## File.Name File.Size.MB. Num.of.rows Num.of.character Num.of.words
## 1 Blogs NA 899288 206824505 37570839
## 2 News NA 77259 15639408 2651432
## 3 Twitter NA 2360148 162096031 30451128
I summarize each file's contents (file size, number of rows, number of characters, and number of words) and collect the results in a table.
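In the rendered report the data frame can also be passed to knitr::kable() for a cleaner layout; this is optional and assumes knitr is available (it is part of the usual R Markdown toolchain).

knitr::kable(table,col.names=c("File Name","File Size (MB)","Rows","Characters","Words"),digits=1)  # nicer column headers in the report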
set.seed(12345)
# convert to ASCII, dropping characters that cannot be represented
blogs1<-iconv(blogs,"latin1","ASCII",sub="")
news1<-iconv(news,"latin1","ASCII",sub="")
twitter1<-iconv(twitter,"latin1","ASCII",sub="")
rm(blogs,news,twitter)  # free the full data sets
# sample only 1% of each file
sample_data<-c(sample(blogs1,length(blogs1)*0.01),
               sample(news1,length(news1)*0.01),
               sample(twitter1,length(twitter1)*0.01))
rm(blogs1,news1,twitter1)
The data sets are very large, so I use the sample() function to draw a 1% sample of each file.
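Because the raw files take a while to read, it can help to persist the 1% sample once it is drawn; writeLines() and readLines() are base R, and the file name sample_data.txt is just an illustrative choice of mine.

# optional: persist the sample so the corpus can be rebuilt without re-reading the raw files
writeLines(sample_data,"sample_data.txt")
# sample_data<-readLines("sample_data.txt")   # reload in a later session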
library(tm)
## Loading required package: NLP
library(NLP)
corpus<-VCorpus(VectorSource(sample_data))
corpus1<-tm_map(corpus,removePunctuation)                  # drop punctuation
corpus2<-tm_map(corpus1,stripWhitespace)                   # collapse repeated whitespace
corpus3<-tm_map(corpus2,tolower)                           # lower-case all text
corpus4<-tm_map(corpus3,removeNumbers)                     # drop digits
corpus5<-tm_map(corpus4,PlainTextDocument)                 # re-wrap as PlainTextDocument after tolower
corpus6<-tm_map(corpus5,removeWords,stopwords("english"))  # remove English stop words
corpus_result<-data.frame(text=unlist(sapply(corpus6,'[',"content")),stringsAsFactors = FALSE)
head(corpus_result)
## text
## 1 put another way spirit sites mission bollocks
## 2 regrets youth
## 3 tom see
## 4 see fault evolution
## 5 seriously wells youngs bull crap selling uk next year get sorted want drinking christmas
## 6 answer pretty straightforward need muscle biopsy painful muscles yes know something looking forward however place can find virus will happy help find virus causing severe disabling disease mitochondria
rm(corpus,corpus1,corpus2,corpus3,corpus4,corpus5)
I build the corpus and check it by converting it to a data frame.
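A side note on the tolower step above: tm documents content_transformer() as the way to apply a plain base-R function inside tm_map(); wrapping tolower keeps each document a PlainTextDocument, so the separate PlainTextDocument re-wrapping step becomes unnecessary. A minimal sketch of that variant (same cleaning, slightly different mechanics):

corpus3<-tm_map(corpus2,content_transformer(tolower))  # keeps the document class intact
corpus4<-tm_map(corpus3,removeNumbers)
corpus5<-tm_map(corpus4,removeWords,stopwords("english"))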
library(RWeka)
one<-function(x) NGramTokenizer(x,Weka_control(min=1,max=1))  # unigram tokenizer
two<-function(x) NGramTokenizer(x,Weka_control(min=2,max=2))  # bigram tokenizer
thr<-function(x) NGramTokenizer(x,Weka_control(min=3,max=3))  # trigram tokenizer
one_table<-TermDocumentMatrix(corpus6,control=list(tokenize=one))
two_table<-TermDocumentMatrix(corpus6,control=list(tokenize=two))
thr_table<-TermDocumentMatrix(corpus6,control=list(tokenize=thr))
one_corpus<-findFreqTerms(one_table,lowfreq=1000)  # unigrams appearing at least 1000 times
two_corpus<-findFreqTerms(two_table,lowfreq=80)    # bigrams appearing at least 80 times
thr_corpus<-findFreqTerms(thr_table,lowfreq=10)    # trigrams appearing at least 10 times
one_corpus_num<-rowSums(as.matrix(one_table[one_corpus,]))
one_corpus_table<-data.frame(Word=names(one_corpus_num),frequency=one_corpus_num)
one_corpus_sort<-one_corpus_table[order(-one_corpus_table$frequency),]
head(one_corpus_sort)
## Word frequency
## just just 2576
## like like 2218
## will will 2211
## one one 2049
## get get 1869
## can can 1866
two_corpus_num<-rowSums(as.matrix(two_table[two_corpus,]))
two_corpus_table<-data.frame(Word=names(two_corpus_num),frequency=two_corpus_num)
two_corpus_sort<-two_corpus_table[order(-two_corpus_table$frequency),]
head(two_corpus_sort)
## Word frequency
## cant wait cant wait 208
## right now right now 206
## dont know dont know 164
## last night last night 148
## im going im going 130
## feel like feel like 125
thr_corpus_num<-rowSums(as.matrix(thr_table[thr_corpus,]))
thr_corpus_table<-data.frame(Word=names(thr_corpus_num),frequency=thr_corpus_num)
thr_corpus_sort<-thr_corpus_table[order(-thr_corpus_table$frequency),]
head(thr_corpus_sort)
## Word frequency
## cant wait see cant wait see 45
## happy mothers day happy mothers day 36
## happy new year happy new year 24
## im pretty sure im pretty sure 18
## italy lakes holidays italy lakes holidays 18
## little italy boston little italy boston 17
I extract the words and their frequencies for each set of N-grams.
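The three blocks above repeat the same sum-and-sort pattern; a small helper (my own refactor for illustration, not part of the original code) produces the same sorted tables and would let further N-gram orders be added with a single call.

freq_table<-function(tdm,terms){
  counts<-rowSums(as.matrix(tdm[terms,]))            # total frequency of each term across documents
  df<-data.frame(Word=names(counts),frequency=counts)
  df[order(-df$frequency),]                          # most frequent first
}
one_corpus_sort<-freq_table(one_table,one_corpus)
two_corpus_sort<-freq_table(two_table,two_corpus)
thr_corpus_sort<-freq_table(thr_table,thr_corpus)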
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
one_g<-ggplot(one_corpus_sort[1:10,],aes(x=reorder(Word,-frequency),y=frequency,fill=frequency))
one_g<-one_g+geom_bar(stat="identity")
one_g<-one_g+labs(title="Unigram Diagram",x="Words",y="Frequency")
one_g<-one_g+theme(axis.text.x=element_text(angle=90))
one_g
two_g<-ggplot(two_corpus_sort[1:10,],aes(x=reorder(Word,-frequency),y=frequency,fill=frequency))
two_g<-two_g+geom_bar(stat="identity")
two_g<-two_g+labs(title="Bigram Diagram",x="Words",y="Frequency")
two_g<-two_g+theme(axis.text.x=element_text(angle=90))
two_g
thr_g<-ggplot(thr_corpus_sort[1:10,],aes(x=reorder(Word,-frequency),y=frequency,fill=frequency))
thr_g<-thr_g+geom_bar(stat="identity")
thr_g<-thr_g+labs(title="Trigram Diagram",x="Words",y="Frequency")
thr_g<-thr_g+theme(axis.text.x=element_text(angle=90))
thr_g
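The three plotting blocks above differ only in their input table and title; a small helper (again my refactor for illustration, not part of the original code) removes the repetition.

plot_ngram<-function(df,title){
  ggplot(df[1:10,],aes(x=reorder(Word,-frequency),y=frequency,fill=frequency))+
    geom_bar(stat="identity")+
    labs(title=title,x="Words",y="Frequency")+
    theme(axis.text.x=element_text(angle=90))
}
plot_ngram(one_corpus_sort,"Unigram Diagram")
plot_ngram(two_corpus_sort,"Bigram Diagram")
plot_ngram(thr_corpus_sort,"Trigram Diagram")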
## Next plans

Next, I will build a predictive algorithm and wrap it in a Shiny app, so that for any text a user types, the predicted next word can be checked interactively.
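As a rough illustration of the direction (only a sketch of one possible backoff lookup over the tables built above, not the final algorithm), the trigram table can be searched for the last two words of a phrase, falling back to the bigram table when nothing matches:

predict_next<-function(phrase,tri=thr_corpus_sort,bi=two_corpus_sort){
  words<-unlist(strsplit(tolower(phrase),"\\s+"))
  n<-length(words)
  # try the last two words against the trigram table first
  if(n>=2){
    key<-paste(words[n-1],words[n])
    hits<-tri[grepl(paste0("^",key," "),tri$Word),]
    if(nrow(hits)>0) return(sub(paste0("^",key," "),"",hits$Word[1]))
  }
  # back off to the bigram table keyed on the last word only
  key<-words[n]
  hits<-bi[grepl(paste0("^",key," "),bi$Word),]
  if(nrow(hits)>0) sub(paste0("^",key," "),"",hits$Word[1]) else NA_character_
}
predict_next("cant wait")  # should return "see" given the trigram counts above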