Loading libraries.

library(tm)
## Loading required package: NLP
library(wordcloud)
## Loading required package: RColorBrewer
library(rmarkdown)

Setting File Paths.

# Directory that holds the raw en_US text files. Only this value has to change
# to point the script at another location; the three file paths derive from it.
data_dir <- "F:/Data science/John Hopkins University/10.Capstone/Coursera-SwiftKey/final/en_US"
blog_path <- file.path(data_dir, "en_US.blogs.txt")
twitter_path <- file.path(data_dir, "en_US.twitter.txt")
news_path <- file.path(data_dir, "en_US.news.txt")

Reading the files into character vectors.

#' Read a whole text file into a character vector.
#'
#' @param path Path of the text file to read.
#' @return Character vector with one element per line of the file.
textTOvector <- function(path) {
  con <- file(path, "r")
  # Always release the connection, even when readLines() fails, so repeated
  # calls do not accumulate open file handles.
  on.exit(close(con), add = TRUE)
  readLines(con, n = -1L)
}
# Load each raw file completely into memory via textTOvector().
# Lines starting with "##" are the warnings printed when this was run.
newsData <- textTOvector(news_path)
## Warning in readLines(con, -1): incomplete final line found on 'F:/Data
## science/John Hopkins University/10.Capstone/Coursera-SwiftKey/final/en_US/
## en_US.news.txt'
twitterData <- textTOvector(twitter_path)
## Warning in readLines(con, -1): line 167155 appears to contain an embedded
## nul
## Warning in readLines(con, -1): line 268547 appears to contain an embedded
## nul
## Warning in readLines(con, -1): line 1274086 appears to contain an embedded
## nul
## Warning in readLines(con, -1): line 1759032 appears to contain an embedded
## nul
blogData <- textTOvector(blog_path)

Cleaning Data.

#' Split raw text into lower-case sentence fragments containing only letters,
#' apostrophes and single spaces.
#'
#' Splitting happens first and directly on the sentence delimiters, so no
#' placeholder tokens are needed. Placeholders such as "rep1"/"rep2" contain
#' digits, which the non-letter replacement destroys before they can be used,
#' leaving stray "rep" tokens in the text.
#'
#' @param textData Character vector of raw text lines.
#' @return Character vector with one element per sentence fragment (text
#'   between any of `;`, `.`, `!`, `?`).
TextTOClean <- function(textData) {
  # Each of ; . ! ? ends a fragment.
  sentence_vector <- unlist(strsplit(x = textData, split = "[;.!?]"))

  # Keep letters and apostrophes; everything else becomes a space.
  sentence_vector <- gsub("[^a-zA-Z']", " ", sentence_vector)
  sentence_vector <- tolower(sentence_vector)

  # Collapse whitespace runs created by the replacement above.
  gsub("\\s+", " ", sentence_vector)
}

# Run every raw text vector through TextTOClean().
corpus_blog <- TextTOClean(textData = blogData)
corpus_tweet <- TextTOClean(textData = twitterData)
corpus_news <- TextTOClean(textData = newsData)

Saving the cleaned data for future use.

# Write the cleaned text as plain text, one fragment per line. DirSource()
# (default mode "text") reads the files in this directory as text, whereas
# saveRDS() output is a compressed serialized R object that a text reader
# cannot interpret correctly.
# NOTE(review): anything that later loads these files must use readLines(),
# not readRDS().
clean_dir <- "F:\\Data science\\John Hopkins University\\10.Capstone\\Coursera-SwiftKey\\Clean"
writeLines(corpus_blog, con = file.path(clean_dir, "blogs.txt"))
writeLines(corpus_tweet, con = file.path(clean_dir, "twitter.txt"))
writeLines(corpus_news, con = file.path(clean_dir, "news.txt"))

# Creating Corpora of the cleaned files.
docs <- Corpus(DirSource(clean_dir))

Exploring the data: counting the lines in each file.

#' Count the lines of a text file without loading it all into memory.
#'
#' @param filepath Path of the text file.
#' @param chunk_size Number of lines requested per readLines() call. Reading
#'   in chunks avoids one function call per line on multi-million-line files.
#' @return Number of lines in the file (numeric, 0 for an empty file).
LinesInFile <- function(filepath, chunk_size = 10000L) {
  stopifnot(chunk_size >= 1)
  con <- file(filepath, "r")
  # Close the connection on every exit path, including read errors.
  on.exit(close(con), add = TRUE)

  Numberlines <- 0
  repeat {
    chunk <- readLines(con, n = chunk_size)
    # A zero-length result means the end of the file was reached.
    if (length(chunk) == 0L) {
      break
    }
    Numberlines <- Numberlines + length(chunk)
  }
  Numberlines
}

# Count the lines of each raw file, then report the totals.
# Lines starting with "##" are the output printed when this was run.
blogLines <- LinesInFile(filepath = blog_path)
newsLines <- LinesInFile(filepath = news_path)
## Warning in readLines(con, n = 1): incomplete final line found on 'F:/Data
## science/John Hopkins University/10.Capstone/Coursera-SwiftKey/final/en_US/
## en_US.news.txt'
twitterLines <- LinesInFile(filepath = twitter_path)
## Warning in readLines(con, n = 1): line 1 appears to contain an embedded nul
## Warning in readLines(con, n = 1): line 1 appears to contain an embedded nul

## Warning in readLines(con, n = 1): line 1 appears to contain an embedded nul

## Warning in readLines(con, n = 1): line 1 appears to contain an embedded nul
paste("blog lines", blogLines)
## [1] "blog lines 899288"
paste("news Lines", newsLines)
## [1] "news Lines 77259"
paste("twitter Lines", twitterLines)
## [1] "twitter Lines 2360148"

Frequency of Words.

# Build the document-term matrix, total each term's count over all documents,
# and order the terms from most to least frequent.
dtm <- DocumentTermMatrix(docs)
frequencyMonogram <- sort(colSums(as.matrix(dtm)), decreasing = TRUE)
head(frequencyMonogram, n = 20)
##    the    and   that    you    rep    for    was   with   this   have 
##  19176  11010   4762   4127   3924   3672   3001   2832   2456   2119 
##    but  thank    are    not yourep   they    all   from   good   what 
##   2113   2099   1939   1904   1737   1508   1456   1439   1428   1314

Making a WordCloud.

# Draw a cloud of at most 200 terms sized by frequency; 30% of the words are
# rotated and colours come from the 8-colour "Dark2" palette.
wordcloud(
  words = names(frequencyMonogram),
  freq = frequencyMonogram,
  max.words = 200,
  rot.per = .3,
  colors = brewer.pal(8, "Dark2")
)