0.00 Install Packages: Not all are used, but this is a standard list
library(tm)
## Loading required package: NLP
library(XML)
library(wordcloud)
## Loading required package: RColorBrewer
library(RColorBrewer)
library(caret)
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
## Loading required package: lattice
library(NLP)
library(openNLP)
library(RWeka)
library(qdap)
## Loading required package: qdapDictionaries
## Loading required package: qdapRegex
##
## Attaching package: 'qdapRegex'
## The following object is masked from 'package:ggplot2':
##
## %+%
## Loading required package: qdapTools
##
## Attaching package: 'qdap'
## The following objects are masked from 'package:tm':
##
## as.DocumentTermMatrix, as.TermDocumentMatrix
## The following object is masked from 'package:NLP':
##
## ngrams
## The following objects are masked from 'package:base':
##
## Filter, proportions
library(ggplot2)
library(stringi)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:qdapTools':
##
## id
## The following object is masked from 'package:qdapRegex':
##
## explain
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
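The masking messages above are harmless, but if a quieter knit is preferred, base R's suppressPackageStartupMessages() silences them; for example:
suppressPackageStartupMessages(library(qdap))  #likewise for the other library() calls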
0.01 Create connections to the training data
eng_twitter <- file("/Users/niteshchampaneri/Desktop/00. Date Science/1. Data Science Specialization/10. Data Science Capstone/1. Data/final/en_US/en_US.twitter.txt", open = "rb")
eng_blogs <- file("/Users/niteshchampaneri/Desktop/00. Date Science/1. Data Science Specialization/10. Data Science Capstone/1. Data/final/en_US/en_US.blogs.txt", open = "rb")
eng_news <- file("/Users/niteshchampaneri/Desktop/00. Date Science/1. Data Science Specialization/10. Data Science Capstone/1. Data/final/en_US/en_US.news.txt", open = "rb")
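The absolute paths above are tied to one machine; if the corpus were kept under a project-relative data/ directory (a hypothetical layout, not the one used here), file.path() would build the same connections portably:
base_dir <- file.path("data", "final", "en_US")  #hypothetical location of the corpus
eng_twitter <- file(file.path(base_dir, "en_US.twitter.txt"), open = "rb")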
1.0 Read in the lines of each data set and show the maximum line length each contains
twit_lines <- readLines(eng_twitter)
blog_lines <- readLines(eng_blogs)
news_lines <- readLines(eng_news)
close(eng_twitter)
close(eng_blogs)
close(eng_news)
summary(nchar(twit_lines))[6]
## Max.
## 140
summary(nchar(blog_lines))[6]
## Max.
## 40833
summary(nchar(news_lines))[6]
## Max.
## 11384
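The Twitter maximum of 140 characters matches the historical tweet length limit. The three calls above can also be collapsed into a single pass; a small sketch that returns the same maxima:
sapply(list(Blogs = blog_lines, News = news_lines, Twitter = twit_lines),
       function(x) max(nchar(x)))  #max(nchar(x)) equals summary(nchar(x))[6]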
1.1 Create a function to calculate the number of characters in a sentence, excluding spaces
char <- function(x){stri_length(x) - stri_count_fixed(x," ")}  #total characters minus the number of spaces
Test <- char("This is a test, the answer should be 31")  #Test == 31
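Because stri_length() and stri_count_fixed() are both vectorised, char() works on whole data sets at once; a quick check (expected results in the comment):
char(c("ab cd", "x y z"))  #returns 4 3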
1.2 Create a summary of the data sets
1.2.1 Add the file sizes to the summary
filesummary <- data.frame(Source = c("Blogs", "News", "Twitter"),
                          FileSize_MB = c(format(structure(object.size(blog_lines), class = "object_size"), units = "auto"),
                                          format(structure(object.size(news_lines), class = "object_size"), units = "auto"),
                                          format(structure(object.size(twit_lines), class = "object_size"), units = "auto")))
1.2.2 Add the number of lines, words, and characters to the summary
filesummary <- data.frame(Source = c("Blogs", "News", "Twitter"),
                          FileSize_MB = c(format(structure(object.size(blog_lines), class = "object_size"), units = "auto"),
                                          format(structure(object.size(news_lines), class = "object_size"), units = "auto"),
                                          format(structure(object.size(twit_lines), class = "object_size"), units = "auto")),
                          Lines = c(length(blog_lines), length(news_lines), length(twit_lines)),
                          Words = c(sum(stri_count_words(blog_lines)), sum(stri_count_words(news_lines)), sum(stri_count_words(twit_lines))),
                          Characters = c(sum(char(blog_lines)), sum(char(news_lines)), sum(char(twit_lines))))
1.2.3 Add average words per line, characters per line, and characters per word to the summary
filesummary <- mutate(filesummary,
                      Words_Per_Line = Words / Lines,
                      Char_Per_Line = round(Characters / Lines, 1),
                      Char_Per_Word = round(Characters / Words, 2))
print(filesummary)
##    Source FileSize_MB   Lines    Words Characters Words_Per_Line Char_Per_Line Char_Per_Word
## 1   Blogs    255.4 Mb  899288 37546250  170389662       41.75109         189.5          4.54
## 2    News    257.3 Mb 1010242 34762395  169860871       34.40997         168.1          4.89
## 3 Twitter      319 Mb 2360148 30093372  134082634       12.75063          56.8          4.46
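If the report is knitted with knitr (the usual toolchain for a document like this), kable() renders the same summary as a formatted table:
knitr::kable(filesummary)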
2.0 Draw a random sample of 1,000 lines from each of the three data sets and split the combined sample into sentences
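sample() draws a different subset on every run, so the counts and plots below change with each knit; setting a seed first makes them reproducible (1234 is an arbitrary choice):
set.seed(1234)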
sampleBlogs <- sample(blog_lines,1000)
sampleNews <- sample(news_lines,1000)
sampleTwitter <- sample(twit_lines,1000)
sampled <- c(sampleBlogs, sampleNews, sampleTwitter)  #"sampled", not "sample", to avoid masking base::sample()
txt <- sent_detect(sampled)  #qdap: split the sampled lines into sentences
remove(sampleBlogs, sampleNews, sampleTwitter, blog_lines, news_lines, twit_lines, sampled)
2.1 Remove numbers, punctuation, and extra whitespace to leave just lower-case text
txt <- removeNumbers(txt)      #Remove numbers
txt <- removePunctuation(txt)  #Remove punctuation
txt <- stripWhitespace(txt)    #Remove extra white space
txt <- tolower(txt)            #Change all text to lower case
txt <- txt[which(txt != "")]   #Drop sentences left empty by the cleaning
txt <- data.frame(txt, stringsAsFactors = FALSE)
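An illustrative trace of these cleaning steps on a made-up string (the string and the results in the comments are examples, not corpus data):
demo <- "He said: 'It costs $12.50!'  Really?"
demo <- removeNumbers(demo)      #"He said: 'It costs $.!'  Really?"
demo <- removePunctuation(demo)  #"He said It costs   Really"
demo <- stripWhitespace(demo)    #"He said It costs Really"
tolower(demo)                    #"he said it costs really"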
2.2 Create 1-grams, 2-grams and 3-grams
words <- WordTokenizer(txt)   #individual word tokens (the 1-grams)
grams <- NGramTokenizer(txt)  #default Weka control returns 1- to 3-grams
#The code below relies on the default tokenizer output listing the 3-grams
#first, then the 2-grams, then the 1-grams: i marks the first 2-word token
#(start of the bigram block) and j the first 1-word token (start of the unigrams).
for(i in 1:length(grams))
{if(length(WordTokenizer(grams[i]))==2) break}
for(j in 1:length(grams))
{if(length(WordTokenizer(grams[j]))==1) break}
onegrams <- data.frame(table(words))
names(onegrams) <- c("Var1", "Freq")  #table(words) names its first column "words"; rename to match the other tables
onegrams <- onegrams[order(onegrams$Freq, decreasing = TRUE),]
bigrams <- data.frame(table(grams[i:(j-1)]))
bigrams <- bigrams[order(bigrams$Freq, decreasing = TRUE),]
trigrams <- data.frame(table(grams[1:(i-1)]))
trigrams <- trigrams[order(trigrams$Freq, decreasing = TRUE),]
remove(i,j,grams)
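A more direct route, sketched here as an alternative rather than the method used above, is to ask RWeka for one n-gram size at a time via Weka_control, which removes the need for the boundary search (bi_alt and tri_alt are illustrative names):
bi_alt  <- NGramTokenizer(txt$txt, Weka_control(min = 2, max = 2))  #2-grams only
tri_alt <- NGramTokenizer(txt$txt, Weka_control(min = 3, max = 3))  #3-grams only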
2.3 Create Plots
onegrams_plot <- barplot(onegrams[1:20,2], col="green",
names.arg = onegrams$Var1[1:20],
main = "One Grams Plot",
ylab = "Count")

bigrams_plot <- barplot(bigrams[1:20,2], col="blue",
names.arg = bigrams$Var1[1:20],
main = "Bi Grams Plot",
ylab = "Count")

trigrams_plot <- barplot(trigrams[1:20,2], col="yellow",
names.arg = trigrams$Var1[1:20],
main = "Tri Grams Plot",
ylab = "Count")
