0.00 Install Packages: Not all are used, but this is a standard list
library(tm)
## Loading required package: NLP
library(XML)
library(wordcloud)
## Loading required package: RColorBrewer
library(RColorBrewer)
library(caret)
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
## Loading required package: lattice
library(NLP)
library(openNLP)
library(RWeka)
library(qdap)
## Loading required package: qdapDictionaries
## Loading required package: qdapRegex
##
## Attaching package: 'qdapRegex'
## The following object is masked from 'package:ggplot2':
##
## %+%
## Loading required package: qdapTools
##
## Attaching package: 'qdap'
## The following objects are masked from 'package:tm':
##
## as.DocumentTermMatrix, as.TermDocumentMatrix
## The following object is masked from 'package:NLP':
##
## ngrams
## The following objects are masked from 'package:base':
##
## Filter, proportions
library(ggplot2)
library(stringi)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:qdapTools':
##
## id
## The following object is masked from 'package:qdapRegex':
##
## explain
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
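The masking messages above are harmless, but if a quieter knit is preferred, base R's suppressPackageStartupMessages() silences them; for example:
suppressPackageStartupMessages(library(qdap))  #likewise for the other library() calls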
0.01 Create connections to the training data
eng_twitter <- file("/Users/niteshchampaneri/Desktop/00. Date Science/1. Data Science Specialization/10. Data Science Capstone/1. Data/final/en_US/en_US.twitter.txt", open = "rb")
eng_blogs <- file("/Users/niteshchampaneri/Desktop/00. Date Science/1. Data Science Specialization/10. Data Science Capstone/1. Data/final/en_US/en_US.blogs.txt", open = "rb")
eng_news <- file("/Users/niteshchampaneri/Desktop/00. Date Science/1. Data Science Specialization/10. Data Science Capstone/1. Data/final/en_US/en_US.news.txt", open = "rb")
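The absolute paths above are tied to one machine; if the corpus were kept under a project-relative data/ directory (a hypothetical layout, not the one used here), file.path() would build the same connections portably:
base_dir <- file.path("data", "final", "en_US")  #hypothetical location of the corpus
eng_twitter <- file(file.path(base_dir, "en_US.twitter.txt"), open = "rb")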
1.0 Read in the lines of each data set and show the maximum line length each contains
twit_lines <- readLines(eng_twitter)
blog_lines <- readLines(eng_blogs)
news_lines <- readLines(eng_news)
close(eng_twitter)
close(eng_blogs)
close(eng_news)
summary(nchar(twit_lines))[6]
## Max.
## 140
summary(nchar(blog_lines))[6]
## Max.
## 40833
summary(nchar(news_lines))[6]
## Max.
## 11384
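The Twitter maximum of 140 characters matches the historical tweet length limit. The three calls above can also be collapsed into a single pass; a small sketch that returns the same maxima:
sapply(list(Blogs = blog_lines, News = news_lines, Twitter = twit_lines),
       function(x) max(nchar(x)))  #max(nchar(x)) equals summary(nchar(x))[6]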
1.1 Create a function to calculate the number of characters in a sentence, excluding spaces
char <- function(x){stri_length(x) - stri_count_fixed(x," ")}  #total characters minus the number of spaces
Test <- char("This is a test, the answer should be 31")  #Test == 31
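Because stri_length() and stri_count_fixed() are both vectorised, char() works on whole data sets at once; a quick check (expected results in the comment):
char(c("ab cd", "x y z"))  #returns 4 3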
1.2 Create a summary of the data sets
1.2.1 Add the file sizes to the summary
filesummary <- data.frame(Source = c("Blogs", "News", "Twitter"),
                          FileSize_MB = c(format(structure(object.size(blog_lines), class = "object_size"), units = "auto"),
                                          format(structure(object.size(news_lines), class = "object_size"), units = "auto"),
                                          format(structure(object.size(twit_lines), class = "object_size"), units = "auto")))
1.2.2 Add the number of lines, words, and characters to the summary
filesummary <- data.frame(Source = c("Blogs", "News", "Twitter"),
                          FileSize_MB = c(format(structure(object.size(blog_lines), class = "object_size"), units = "auto"),
                                          format(structure(object.size(news_lines), class = "object_size"), units = "auto"),
                                          format(structure(object.size(twit_lines), class = "object_size"), units = "auto")),
                          Lines = c(length(blog_lines), length(news_lines), length(twit_lines)),
                          Words = c(sum(stri_count_words(blog_lines)), sum(stri_count_words(news_lines)), sum(stri_count_words(twit_lines))),
                          Characters = c(sum(char(blog_lines)), sum(char(news_lines)), sum(char(twit_lines))))
1.2.3 Add average words per line, characters per line, and characters per word to the summary
filesummary <- mutate(filesummary,
                      Words_Per_Line = Words / Lines,
                      Char_Per_Line = round(Characters / Lines, 1),
                      Char_Per_Word = round(Characters / Words, 2))
print(filesummary)
##    Source FileSize_MB   Lines    Words Characters Words_Per_Line Char_Per_Line Char_Per_Word
## 1   Blogs    255.4 Mb  899288 37546250  170389662       41.75109         189.5          4.54
## 2    News    257.3 Mb 1010242 34762395  169860871       34.40997         168.1          4.89
## 3 Twitter      319 Mb 2360148 30093372  134082634       12.75063          56.8          4.46
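If the report is knitted with knitr (the usual toolchain for a document like this), kable() renders the same summary as a formatted table:
knitr::kable(filesummary)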
2.0 Draw a random sample of 1,000 lines from each of the three data sets and split the combined sample into sentences
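sample() draws a different subset on every run, so the counts and plots below change with each knit; setting a seed first makes them reproducible (1234 is an arbitrary choice):
set.seed(1234)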
sampleBlogs <- sample(blog_lines,1000)
sampleNews <- sample(news_lines,1000)
sampleTwitter <- sample(twit_lines,1000)
sampled <- c(sampleBlogs, sampleNews, sampleTwitter)  #"sampled", not "sample", to avoid masking base::sample()
txt <- sent_detect(sampled)  #qdap: split the sampled lines into sentences
remove(sampleBlogs, sampleNews, sampleTwitter, blog_lines, news_lines, twit_lines, sampled)
2.1 Remove numbers, punctuation, and extra whitespace to leave just lower-case text
txt <- removeNumbers(txt)      #Remove numbers
txt <- removePunctuation(txt)  #Remove punctuation
txt <- stripWhitespace(txt)    #Remove extra white space
txt <- tolower(txt)            #Change all text to lower case
txt <- txt[which(txt != "")]   #Drop sentences left empty by the cleaning
txt <- data.frame(txt, stringsAsFactors = FALSE)
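An illustrative trace of these cleaning steps on a made-up string (the string and the results in the comments are examples, not corpus data):
demo <- "He said: 'It costs $12.50!'  Really?"
demo <- removeNumbers(demo)      #"He said: 'It costs $.!'  Really?"
demo <- removePunctuation(demo)  #"He said It costs   Really"
demo <- stripWhitespace(demo)    #"He said It costs Really"
tolower(demo)                    #"he said it costs really"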
2.2 Create 1-grams, 2-grams and 3-grams
words <- WordTokenizer(txt)   #individual word tokens (the 1-grams)
grams <- NGramTokenizer(txt)  #default Weka control returns 1- to 3-grams
#The code below relies on the default tokenizer output listing the 3-grams
#first, then the 2-grams, then the 1-grams: i marks the first 2-word token
#(start of the bigram block) and j the first 1-word token (start of the unigrams).
for(i in 1:length(grams))
{if(length(WordTokenizer(grams[i]))==2) break}
for(j in 1:length(grams))
{if(length(WordTokenizer(grams[j]))==1) break}
onegrams <- data.frame(table(words))
names(onegrams) <- c("Var1", "Freq")  #table(words) names its first column "words"; rename to match the other tables
onegrams <- onegrams[order(onegrams$Freq, decreasing = TRUE),]
bigrams <- data.frame(table(grams[i:(j-1)]))
bigrams <- bigrams[order(bigrams$Freq, decreasing = TRUE),]
trigrams <- data.frame(table(grams[1:(i-1)]))
trigrams <- trigrams[order(trigrams$Freq, decreasing = TRUE),]
remove(i,j,grams)
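A more direct route, sketched here as an alternative rather than the method used above, is to ask RWeka for one n-gram size at a time via Weka_control, which removes the need for the boundary search (bi_alt and tri_alt are illustrative names):
bi_alt  <- NGramTokenizer(txt$txt, Weka_control(min = 2, max = 2))  #2-grams only
tri_alt <- NGramTokenizer(txt$txt, Weka_control(min = 3, max = 3))  #3-grams only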
2.3 Create Plots
onegrams_plot <- barplot(onegrams[1:20,2], col="green",
names.arg = onegrams$Var1[1:20],
main = "One Grams Plot",
ylab = "Count")

bigrams_plot <- barplot(bigrams[1:20,2], col="blue",
names.arg = bigrams$Var1[1:20],
main = "Bi Grams Plot",
ylab = "Count")

trigrams_plot <- barplot(trigrams[1:20,2], col="yellow",
names.arg = trigrams$Var1[1:20],
main = "Tri Grams Plot",
ylab = "Count")
