#' Draw a random subset of a vector without replacement.
#'
#' @param textbody Vector to sample from (e.g. a character vector of lines).
#' @param portion Single number in [0, 1]: the fraction of `textbody` to keep.
#'   The number of elements kept is `floor(length(textbody) * portion)`.
#' @return The selected elements of `textbody`, in random order. An empty
#'   `textbody` yields an empty result.
sampletext <- function(textbody, portion) {
  # Fail early with a clear message instead of an obscure error from the
  # sampler when the fraction is non-numeric, non-scalar, NA or outside [0, 1].
  stopifnot(
    is.numeric(portion),
    length(portion) == 1L,
    !is.na(portion),
    portion >= 0,
    portion <= 1
  )
  total <- length(textbody)
  # sample.int() draws indices from 1..total directly, so no index vector has
  # to be built with 1:length(), which evaluates to c(1, 0) for empty input.
  taking <- sample.int(total, size = floor(total * portion))
  textbody[taking]
}
# Fix the RNG seed so that the random subsets drawn below are reproducible.
set.seed(1103)

# The source text files are large, so only a fraction of each data set is
# kept; working on the samples shortens processing time.
portion <- 25 / 50

# Draw the three samples in this order (twitter, blogs, news) so the random
# number stream is consumed the same way on every run.
SampleTwitter <- sampletext(twitter, portion)
SampleBlog <- sampletext(blogs, portion)
SampleNews <- sampletext(news, portion)

# Combine the sampled texts into a single vector: blogs, then news, then
# twitter.
SampleAll <- c(SampleBlog, SampleNews, SampleTwitter)

# Write the combined sample to a text file for further analysis, then read
# it back in through an explicitly opened connection.
writeLines(SampleAll, "SampleAll.txt")
theSampleCon <- file("SampleAll.txt", open = "rt")
theSample <- readLines(theSampleCon)
# Recorded output of an earlier run:
## Warning in readLines(theSampleCon): incomplete final line found on
## 'SampleAll.txt'
close(theSampleCon)
# Summary statistics for the three corpora and for the combined sample:
# file size on disk (in MiB), number of lines, and number of
# whitespace-delimited words.

# Size of the file at `path` in mebibytes; NA when file.info() reports no size
# (e.g. the file does not exist).
fileSizeMb <- function(path) {
  file.info(path)$size / 1024.0 / 1024.0
}

# Total number of runs of non-whitespace characters ("words") in `lines`.
# gregexpr() returns a single -1 for an element with no match and NA for an NA
# element; counting only positive match positions keeps empty, blank and NA
# lines from being counted as one word each.
countWords <- function(lines) {
  matches <- gregexpr("\\S+", lines)
  sum(vapply(matches, function(m) sum(m > 0L, na.rm = TRUE), integer(1)))
}

# File sizes.
blogsFile <- fileSizeMb("./final/en_US/en_US.blogs.txt")
newsFile <- fileSizeMb("./final/en_US/en_US.news.txt")
twitterFile <- fileSizeMb("./final/en_US/en_US.twitter.txt")
# NOTE(review): this previously pointed at "./MilestoneReport/textSample.txt",
# but the sample held in `theSample` is written to and read from
# "SampleAll.txt"; measure that file so all sample statistics describe the
# same data. Confirm that no separate textSample.txt was intended.
sampleFile <- fileSizeMb("SampleAll.txt")

# Line counts (one vector element per line).
blogsLength <- length(blogs)
newsLength <- length(news)
twitterLength <- length(twitter)
sampleLength <- length(theSample)

# Word counts.
blogsWords <- countWords(blogs)
newsWords <- countWords(news)
twitterWords <- countWords(twitter)
sampleWords <- countWords(theSample)
### Words per line
# stringi provides stri_count_words(), used below to count the words in each
# line.
library(stringi)
## Warning: package 'stringi' was built under R version 3.6.2
# One element per corpus, in the order blogs, news, twitter.
wpl <- lapply(list(blogs, news, twitter), stri_count_words)
## Analysing: display the total word counts of the three corpora.
## The "## [1] ..." lines are output recorded from an earlier run.
blogsWords
## [1] 37334131
newsWords
## [1] 34372530
twitterWords
## [1] 30373583
# NOTE(review): this repeats the library(stringi) call above, while the
# recorded warnings below mention 'ggplot2' and 'gridExtra' -- presumably
# those packages were meant to be loaded here; confirm against later code.
library(stringi)
## Warning: package 'ggplot2' was built under R version 3.6.2
## Warning: package 'gridExtra' was built under R version 3.6.3