Loading libraries.
library(tm)
## Loading required package: NLP
library(wordcloud)
## Loading required package: RColorBrewer
library(rmarkdown)
Setting File Paths.
# Absolute locations of the three raw en_US text files (blogs, tweets, news).
blog_path <- "F:/Data science/John Hopkins University/10.Capstone/Coursera-SwiftKey/final/en_US/en_US.blogs.txt"
twitter_path <- "F:/Data science/John Hopkins University/10.Capstone/Coursera-SwiftKey/final/en_US/en_US.twitter.txt"
news_path <- "F:/Data science/John Hopkins University/10.Capstone/Coursera-SwiftKey/final/en_US/en_US.news.txt"
Reading the files into character vectors.
# Read every line of a text file into a character vector.
#
# path: location of the file to read.
# Returns a character vector with one element per line of the file.
textTOvector <- function(path) {
  con <- file(path, "r")
  # Close the connection when the function exits, even if readLines() fails.
  # Without this every call leaves an open connection behind.
  on.exit(close(con), add = TRUE)
  readLines(con, -1)
}
# Load each raw file completely into memory, one vector element per line.
newsData <- textTOvector(path = news_path)
## Warning in readLines(con, -1): incomplete final line found on 'F:/Data
## science/John Hopkins University/10.Capstone/Coursera-SwiftKey/final/en_US/
## en_US.news.txt'
twitterData <- textTOvector(path = twitter_path)
## Warning in readLines(con, -1): line 167155 appears to contain an embedded
## nul
## Warning in readLines(con, -1): line 268547 appears to contain an embedded
## nul
## Warning in readLines(con, -1): line 1274086 appears to contain an embedded
## nul
## Warning in readLines(con, -1): line 1759032 appears to contain an embedded
## nul
blogData <- textTOvector(path = blog_path)
Cleaning Data.
# Split raw text into lower-cased sentences containing only letters,
# apostrophes and single spaces.
#
# textData: character vector of raw text lines.
# Returns a character vector with one element per sentence, where a sentence
# ends at any of ; . ! ?
#
# Earlier versions marked terminators and apostrophes with the placeholders
# "rep1" / "rep2" before stripping non-letters. The digits in those
# placeholders were stripped too, so sentences were never split, apostrophes
# were never restored, and the token "rep" leaked into the text. Splitting
# first and keeping the apostrophe in the character class avoids placeholders
# altogether.
TextTOClean <- function(textData) {
  # One element per sentence; each terminator character is a split point.
  sentence_vector <- unlist(strsplit(x = textData, split = "[;.!?]"))
  # Anything that is not an ASCII letter or an apostrophe becomes a space.
  sentence_vector <- gsub("[^a-zA-Z']", x = sentence_vector, replacement = " ")
  sentence_vector <- tolower(sentence_vector)
  # Collapse the runs of whitespace produced by the substitution above.
  gsub("\\s+", x = sentence_vector, replacement = " ")
}
# Run the cleaning step over each of the three sources.
corpus_blog <- TextTOClean(textData = blogData)
corpus_tweet <- TextTOClean(textData = twitterData)
corpus_news <- TextTOClean(textData = newsData)
Saving the cleaned sentences for future use.
# Write the cleaned sentences as plain text, one sentence per line.
# These files carry a .txt extension and the "Clean" directory is read back
# as plain text by DirSource(); saveRDS() would store a binary serialised R
# object there instead, so the corpus would be built from serialisation bytes
# rather than from the sentences. Reload the files with readLines().
writeLines(corpus_blog, con = "F:\\Data science\\John Hopkins University\\10.Capstone\\Coursera-SwiftKey\\Clean\\blogs.txt")
writeLines(corpus_tweet, con = "F:\\Data science\\John Hopkins University\\10.Capstone\\Coursera-SwiftKey\\Clean\\twitter.txt")
writeLines(corpus_news, con = "F:\\Data science\\John Hopkins University\\10.Capstone\\Coursera-SwiftKey\\Clean\\news.txt")
# Build a tm corpus with one document per file in the "Clean" directory.
docs <- Corpus(
  DirSource(directory = "F:\\Data science\\John Hopkins University\\10.Capstone\\Coursera-SwiftKey\\Clean")
)
Exploring the data.
# Count the lines of a text file without holding the whole file in memory.
#
# filepath: location of the file to inspect.
# Returns the number of lines as a numeric scalar (0 for an empty file).
LinesInFile <- function(filepath) {
  con <- file(filepath, "r")
  # Close the connection on every exit path, including errors.
  on.exit(close(con), add = TRUE)
  # Read in chunks: one readLines() call per line is very slow on files with
  # millions of lines, while a bounded chunk keeps memory use small.
  chunkSize <- 10000L
  Numberlines <- 0
  repeat {
    chunk <- readLines(con, n = chunkSize)
    if (length(chunk) == 0) {
      break
    }
    Numberlines <- Numberlines + length(chunk)
  }
  Numberlines
}
# Count the lines of each raw file, then report the totals.
blogLines <- LinesInFile(filepath = blog_path)
newsLines <- LinesInFile(filepath = news_path)
## Warning in readLines(con, n = 1): incomplete final line found on 'F:/Data
## science/John Hopkins University/10.Capstone/Coursera-SwiftKey/final/en_US/
## en_US.news.txt'
twitterLines <- LinesInFile(filepath = twitter_path)
## Warning in readLines(con, n = 1): line 1 appears to contain an embedded nul
## Warning in readLines(con, n = 1): line 1 appears to contain an embedded nul
## Warning in readLines(con, n = 1): line 1 appears to contain an embedded nul
## Warning in readLines(con, n = 1): line 1 appears to contain an embedded nul
paste("blog lines", blogLines)
## [1] "blog lines 899288"
paste("news Lines", newsLines)
## [1] "news Lines 77259"
paste("twitter Lines", twitterLines)
## [1] "twitter Lines 2360148"
Frequency of Words.
# Count how often each single word occurs across all documents in the corpus.
dtm <- DocumentTermMatrix(docs)
# Column sums of the dense matrix give the total count per term.
frequencyMonogram <- colSums(as.matrix(dtm))
# TRUE rather than T: T is an ordinary variable that can be reassigned.
frequencyMonogram <- sort(frequencyMonogram, decreasing = TRUE)
head(frequencyMonogram, 20)
## the and that you rep for was with this have
## 19176 11010 4762 4127 3924 3672 3001 2832 2456 2119
## but thank are not yourep they all from good what
## 2113 2099 1939 1904 1737 1508 1456 1439 1428 1314
Making a WordCloud.
# Plot at most 200 of the terms, sized by frequency, with 30% of them rotated.
wordcloud(
  names(frequencyMonogram),
  frequencyMonogram,
  max.words = 200,
  colors = brewer.pal(8, "Dark2"),
  rot.per = .3
)
