0.00 Load Packages: Not all are used, but this is a standard list

library(tm)
## Loading required package: NLP
library(XML)
library(wordcloud)
## Loading required package: RColorBrewer
library(RColorBrewer)
library(caret)
## Loading required package: ggplot2
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate
## Loading required package: lattice
library(NLP)
library(openNLP)
library(RWeka)
library(qdap)
## Loading required package: qdapDictionaries
## Loading required package: qdapRegex
## 
## Attaching package: 'qdapRegex'
## The following object is masked from 'package:ggplot2':
## 
##     %+%
## Loading required package: qdapTools
## 
## Attaching package: 'qdap'
## The following objects are masked from 'package:tm':
## 
##     as.DocumentTermMatrix, as.TermDocumentMatrix
## The following object is masked from 'package:NLP':
## 
##     ngrams
## The following objects are masked from 'package:base':
## 
##     Filter, proportions
library(ggplot2)
library(stringi) 
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:qdapTools':
## 
##     id
## The following object is masked from 'package:qdapRegex':
## 
##     explain
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

0.01 Create a connection to the training data

# Directory that holds the three en_US corpus files. Keeping it in one place
# means only this line has to change to run the report on another machine.
data_dir <- "/Users/niteshchampaneri/Desktop/00. Date Science/1. Data Science Specialization/10. Data Science Capstone/1. Data/final/en_US"

# Open one read connection per corpus in binary mode ("rb"); file.path() joins
# the directory and file name with "/", giving the same paths as before.
eng_twitter <- file(file.path(data_dir, "en_US.twitter.txt"), open = "rb")
eng_blogs <- file(file.path(data_dir, "en_US.blogs.txt"), open = "rb")
eng_news <- file(file.path(data_dir, "en_US.news.txt"), open = "rb")

1.0 Read the lines of each data set and show the length, in characters, of the longest line in each

# Load every line of each corpus into a character vector, closing each
# connection as soon as its file has been read.
twit_lines <- readLines(con = eng_twitter)
close(con = eng_twitter)

blog_lines <- readLines(con = eng_blogs)
close(con = eng_blogs)

news_lines <- readLines(con = eng_news)
close(con = eng_news)

# Length in characters of the longest line of each corpus, taken from the
# "Max." entry of summary().
summary(nchar(twit_lines))["Max."]
## Max. 
##  140
summary(nchar(blog_lines))["Max."]
##  Max. 
## 40833
summary(nchar(news_lines))["Max."]
##  Max. 
## 11384

1.1 Create a function to calculate the number of characters in a sentence, excluding spaces

# Count the characters of each string in x, leaving out the " " characters.
#
# x: character vector of text to measure.
# Returns one count per element of x: its length minus its number of spaces.
char <- function(x) {
  total_chars <- stri_length(x)
  space_chars <- stri_count_fixed(x, " ")
  total_chars - space_chars
}

# Sanity check for char(): the sentence has 39 characters, 8 of them spaces,
# so the expected result is 31. Stop the script if char() disagrees rather
# than silently carrying on with wrong character counts.
Test <- char("This is a test, the answer should be 31")
stopifnot(Test == 31)

1.2 Create a summary of the data sets

1.2.1 Add the filesizes to the summary

# First version of the summary table: one row per corpus with the size of its
# in-memory character vector (object.size), formatted with an automatically
# chosen unit.
# The sizes must be listed in the same order as Source (Blogs, News, Twitter);
# the News and Twitter sizes were previously swapped.
# object.size() already returns an "object_size" object, so no re-classing is
# needed before format().
filesummary <- data.frame(
  Source = c("Blogs", "News", "Twitter"),
  FileSize_MB = c(
    format(object.size(blog_lines), units = "auto"),
    format(object.size(news_lines), units = "auto"),
    format(object.size(twit_lines), units = "auto")
  )
)

1.2.2 Add the number of lines, words and characters to the summary

# Rebuild the summary table with, per corpus, the formatted in-memory size and
# the totals of lines, words and non-space characters. local() keeps the small
# helper out of the global workspace.
filesummary <- local({
  # Formatted size of one corpus' character vector.
  size_label <- function(lines) format(object.size(lines), units = "auto")

  data.frame(
    Source = c("Blogs", "News", "Twitter"),
    FileSize_MB = c(
      size_label(blog_lines),
      size_label(news_lines),
      size_label(twit_lines)
    ),
    Lines = c(
      length(blog_lines),
      length(news_lines),
      length(twit_lines)
    ),
    Words = c(
      sum(stri_count_words(blog_lines)),
      sum(stri_count_words(news_lines)),
      sum(stri_count_words(twit_lines))
    ),
    Characters = c(
      sum(char(blog_lines)),
      sum(char(news_lines)),
      sum(char(twit_lines))
    )
  )
})

1.2.3 Add average words per line, characters per line and characters per word to the summary

# Derive the per-line and per-word averages from the totals already in the
# table, then display the finished summary.
filesummary <- filesummary %>%
  mutate(
    Words_Per_Line = Words / Lines,
    Char_Per_Line = round(Characters / Lines, 1),
    Char_Per_Word = round(Characters / Words, 2)
  )

print(filesummary)
##    Source FileSize_MB   Lines    Words Characters Words_Per_Line Char_Per_Line
## 1   Blogs    255.4 Mb  899288 37546250  170389662       41.75109         189.5
## 2    News    257.3 Mb 1010242 34762395  169860871       34.40997         168.1
## 3 Twitter      319 Mb 2360148 30093372  134082634       12.75063          56.8
##   Char_Per_Word
## 1          4.54
## 2          4.89
## 3          4.46

2.0 Draw a random sample of 1000 lines from each of the three data sets and split the combined sample into sentences.

# Fix the random seed so the same lines are drawn on every run and the n-gram
# tables and plots below can be reproduced.
set.seed(1234)

# Draw 1000 lines at random (without replacement) from each corpus.
sampleBlogs <- sample(blog_lines, 1000)
sampleNews <- sample(news_lines, 1000)
sampleTwitter <- sample(twit_lines, 1000)

# Pool the three samples and split them into sentences. The pooled vector is
# not called `sample`, so it does not reuse the name of base::sample().
sample_lines <- c(sampleBlogs, sampleNews, sampleTwitter)
txt <- sent_detect(sample_lines)

# Drop the full corpora and the intermediate samples; only `txt` is used from
# here on.
remove(sampleBlogs, sampleNews, sampleTwitter,
       blog_lines, news_lines, twit_lines, sample_lines)

2.1 Remove Numbers, Punctuation etc to leave just characters

# Normalise the sentences: drop digits, drop punctuation, collapse runs of
# whitespace, and lower-case everything.
txt <- txt %>%
  removeNumbers() %>%
  removePunctuation() %>%
  stripWhitespace() %>%
  tolower()

# Keep only the sentences that still contain text (missing values and empty
# strings are discarded), then store them as a one-column data frame.
txt <- txt[!is.na(txt) & txt != ""]
txt <- data.frame(txt, stringsAsFactors = FALSE)

2.2 Create 1-grams, 2-grams and 3-grams

# Tokenise the cleaned text into single words and into n-grams.
# NOTE(review): txt is a one-column data.frame here, not a character vector;
# presumably the tokenizers coerce it with as.character() -- confirm this is
# the intended input.
words <- WordTokenizer(txt)
grams <- NGramTokenizer(txt)

# Number of words in every n-gram, computed in one vectorised pass.
# This replaces two loops that called WordTokenizer() once per gram to find
# where the 2-word and 1-word grams start. Those loops relied on the grams
# being grouped by size and silently produced wrong slices when no boundary
# was found.
# Assumes the words of a gram are separated by single spaces -- TODO confirm
# against the RWeka tokenizer output.
gram_sizes <- lengths(strsplit(grams, " ", fixed = TRUE))

# Frequency tables, most frequent first. Columns are `words`/`Freq` for
# onegrams and `Var1`/`Freq` for bigrams and trigrams.
onegrams <- data.frame(table(words))
onegrams <- onegrams[order(onegrams$Freq, decreasing = TRUE), ]
bigrams <- data.frame(table(grams[gram_sizes == 2]))
bigrams <- bigrams[order(bigrams$Freq, decreasing = TRUE), ]
trigrams <- data.frame(table(grams[gram_sizes == 3]))
trigrams <- trigrams[order(trigrams$Freq, decreasing = TRUE), ]
remove(grams, gram_sizes)

2.3 Create Plots

# Draw a bar chart of the most frequent entries of an n-gram frequency table.
#
# freq_table: data frame whose first column holds the n-grams and whose second
#             column holds their counts, sorted by descending count.
# bar_col:    fill colour of the bars.
# title:      main title of the chart.
# top_n:      maximum number of rows to plot (fewer are plotted when the table
#             is shorter, instead of padding with NA rows).
#
# Returns the value of barplot().
plot_top_ngrams <- function(freq_table, bar_col, title, top_n = 20) {
  top_rows <- head(freq_table, top_n)
  barplot(top_rows[[2]],
          col = bar_col,
          # Columns are addressed by position because the n-gram column is
          # named `words` in onegrams but `Var1` in bigrams and trigrams.
          names.arg = as.character(top_rows[[1]]),
          main = title,
          ylab = "Count")
}

onegrams_plot <- plot_top_ngrams(onegrams, "green", "One Grams Plot")

bigrams_plot <- plot_top_ngrams(bigrams, "blue", "Bi Grams Plot")

# Plot the tri-gram counts; this chart previously drew the bi-gram counts
# under tri-gram labels.
trigrams_plot <- plot_top_ngrams(trigrams, "yellow", "Tri Grams Plot")