The goal of this milestone report is to build models for text prediction using natural language processing techniques. A large text corpus will be used as training data for this analysis.
# First, load the required packages.
library(tm)
## Warning: package 'tm' was built under R version 3.4.2
## Loading required package: NLP
require(ggplot2)
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
require(RWeka)
## Loading required package: RWeka
## Warning: package 'RWeka' was built under R version 3.4.2
# Create a directory and download the required dataset
if(!file.exists("./datasets")){dir.create("./datasets")}
fileUrl <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
if(!file.exists("./datasets/Coursera-SwiftKey.zip")){
    download.file(fileUrl, destfile = "./datasets/Coursera-SwiftKey.zip", mode = "wb")
}
# Unzip the files
if(!file.exists("./datasets/final")){
    unzip(zipfile = "./datasets/Coursera-SwiftKey.zip", exdir = "./datasets")
}
list.files("./final/en_US")
## [1] "en_US.blogs.txt" "en_US.news.txt" "en_US.twitter.txt"
Now that all three files are in place, it is time to read them in and sample them for our study.
blogs <- readLines("./datasets/final/en_US/en_US.blogs.txt")
news <- readLines("./datasets/final/en_US/en_US.news.txt")
## Warning in readLines("./datasets/final/en_US/en_US.news.txt"): incomplete
## final line found on './datasets/final/en_US/en_US.news.txt'
twitter <- readLines("./datasets/final/en_US/en_US.twitter.txt")
## Warning in readLines("./datasets/final/en_US/en_US.twitter.txt"): line
## 167155 appears to contain an embedded nul
## Warning in readLines("./datasets/final/en_US/en_US.twitter.txt"): line
## 268547 appears to contain an embedded nul
## Warning in readLines("./datasets/final/en_US/en_US.twitter.txt"): line
## 1274086 appears to contain an embedded nul
## Warning in readLines("./datasets/final/en_US/en_US.twitter.txt"): line
## 1759032 appears to contain an embedded nul
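The embedded-nul and incomplete-final-line warnings are harmless for this analysis, but readLines can avoid them: skipNul drops embedded nul characters and encoding declares the input encoding. A minimal sketch, should cleaner input be preferred:
# Optional: re-read the Twitter file, dropping embedded nuls and declaring
# UTF-8. Both skipNul and encoding are standard readLines arguments.
twitter <- readLines("./datasets/final/en_US/en_US.twitter.txt",
                     encoding = "UTF-8", skipNul = TRUE)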
# Now we check the size of the three files in megabytes
blog_file_Size <- file.info("./datasets/final/en_US/en_US.blogs.txt")$size / 1024^2
news_file_Size <- file.info("./datasets/final/en_US/en_US.news.txt")$size / 1024^2
twitter_file_Size <- file.info("./datasets/final/en_US/en_US.twitter.txt")$size / 1024^2
print(paste("News Data Length = ", length(news),
", News Blog Length = ", length(blogs),
", News twitter Length = ", length(twitter)
))
## [1] "News Data Length = 77259 , News Blog Length = 899288 , News twitter Length = 2360148"
library(stringi)
stri_stats_general(blogs)
## Lines LinesNEmpty Chars CharsNWhite
## 899288 899288 208361438 171926076
stri_stats_general(news)
## Lines LinesNEmpty Chars CharsNWhite
## 77259 77259 15683765 13117038
stri_stats_general(twitter)
## Lines LinesNEmpty Chars CharsNWhite
## 2360148 2360148 162384825 134370864
# Now the word counts
words_blogs <- stri_count_words(blogs)
words_twitter <- stri_count_words(twitter)
words_news <- stri_count_words(news)
# Now the summary of the findings
dataset_view <- data.frame(source = c("blogs", "news", "twitter"),
                           file.size.MB = c(blog_file_Size, news_file_Size, twitter_file_Size),
                           num.lines = c(length(blogs), length(news), length(twitter)),
                           num.words = c(sum(words_blogs), sum(words_news), sum(words_twitter)),
                           mean.num.words = c(mean(words_blogs), mean(words_news), mean(words_twitter)))
print(dataset_view)
## source file.size.MB num.lines num.words mean.num.words
## 1 blogs 200.4242 899288 38154238 42.42716
## 2 news 196.2775 77259 2693898 34.86840
## 3 twitter 159.3641 2360148 30218125 12.80349
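Blogs have by far the longest lines (about 42 words on average), while tweets, capped at 140 characters, average under 13 words. Since ggplot2 is already loaded, these averages can be visualized; the chart below is an illustrative sketch, not part of the original output:
# Bar chart of the mean words per line for each source
ggplot(dataset_view, aes(x = source, y = mean.num.words)) +
    geom_col() +
    labs(x = "Source", y = "Mean words per line",
         title = "Average line length by source")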
# Build a sample corpus from the first 10,000 lines of each source
fileSamples <- c(news[1:10000], blogs[1:10000], twitter[1:10000])
newDataSets <- VCorpus(VectorSource(fileSamples))
# rm(fileSamples)
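Taking the first 10,000 lines of each file is simple but biased toward the start of each file; drawing lines at random with sample() would be more representative. As a next step toward the prediction model, the corpus can be cleaned with tm and tokenized into n-grams with RWeka. The sketch below assumes the newDataSets corpus built above; the cleaning steps and the lowfreq threshold are illustrative choices, not prescriptive ones.
# A more representative alternative sample (hypothetical, not run here):
# set.seed(1234)
# fileSamples <- c(sample(news, 10000), sample(blogs, 10000), sample(twitter, 10000))

# Basic cleaning with tm
cleanCorpus <- tm_map(newDataSets, content_transformer(tolower))
cleanCorpus <- tm_map(cleanCorpus, removePunctuation)
cleanCorpus <- tm_map(cleanCorpus, removeNumbers)
cleanCorpus <- tm_map(cleanCorpus, stripWhitespace)

# Bigram tokenizer built on RWeka, plugged into a term-document matrix
BigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
bigram_tdm <- TermDocumentMatrix(cleanCorpus, control = list(tokenize = BigramTokenizer))

# Inspect bigrams that occur at least 50 times (threshold is arbitrary)
findFreqTerms(bigram_tdm, lowfreq = 50)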