# First, load the required packages.
library(tm)
## Warning: package 'tm' was built under R version 3.4.2
## Loading required package: NLP
# Fetch the Coursera-SwiftKey archive into ./datasets and extract it there.
# Every step is guarded by an existence check, so re-running the script does
# not download or unzip again.
data_dir <- "./datasets"
zip_path <- "./datasets/Coursera-SwiftKey.zip"
fileUrl <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"

if (!file.exists(data_dir)) {
  dir.create(data_dir)
}

# mode = "wb" requests a binary transfer for the zip archive.
if (!file.exists(zip_path)) {
  download.file(fileUrl, destfile = zip_path, mode = "wb")
}

# Extract only when ./datasets/final is not there yet.
if (!file.exists("./datasets/final")) {
  unzip(zipfile = zip_path, exdir = data_dir)
}
# List and read the three English (en_US) corpus files.
# The archive is extracted with exdir = "./datasets", so the files live under
# ./datasets/final/en_US (not ./final/en_US relative to the working directory).
en_us_dir <- "./datasets/final/en_US"
blogs_path <- file.path(en_us_dir, "en_US.blogs.txt")
news_path <- file.path(en_us_dir, "en_US.news.txt")
twitter_path <- file.path(en_us_dir, "en_US.twitter.txt")

# View the content of the folder.
list.files(en_us_dir)
## [1] "en_US.blogs.txt" "en_US.news.txt" "en_US.twitter.txt"

blogs <- readLines(blogs_path, encoding = "UTF-8")

# readLines() warns that four twitter lines (167155, 268547, 1274086 and
# 1759032) appear to contain an embedded nul; each of those lines is cut off
# at the nul. Passing skipNul = TRUE would keep the rest of the line and
# silence the warning, but it would also change the counts recorded below.
twitter <- readLines(twitter_path, encoding = "UTF-8")

# The news file is read through a connection opened in binary mode ("rb").
# NOTE(review): presumably this avoids a text-mode read stopping early on a
# control character inside the file -- confirm on the target platform.
con <- file(news_path, open = "rb")
news <- readLines(con, encoding = "UTF-8")
close(con)
rm(con)

# Size of the three files on disk, in megabytes (bytes / 1024^2).
blog_file_Size <- file.info(blogs_path)$size / 1024^2
news_file_Size <- file.info(news_path)$size / 1024^2
twitter_file_Size <- file.info(twitter_path)$size / 1024^2
library(stringi)
# General statistics per corpus. The columns reported by stri_stats_general()
# are: Lines, LinesNEmpty (lines with at least one non-white-space character),
# Chars (code points) and CharsNWhite (non-white-space code points).
stringi::stri_stats_general(blogs)
## Lines LinesNEmpty Chars CharsNWhite
## 899288 899288 206824382 170389539
stringi::stri_stats_general(news)
## Lines LinesNEmpty Chars CharsNWhite
## 1010242 1010242 203223154 169860866
stringi::stri_stats_general(twitter)
## Lines LinesNEmpty Chars CharsNWhite
## 2360148 2360148 162096031 134082634

# Word count of every line, one vector per corpus.
words_blogs <- stringi::stri_count_words(blogs)
words_news <- stringi::stri_count_words(news)
words_twitter <- stringi::stri_count_words(twitter)
# Summary table: one row per corpus with file size (MB), number of lines,
# total number of words and mean number of words per line.
dataset_view <- local({
  corpora <- list(blogs, news, twitter)
  word_counts <- list(words_blogs, words_news, words_twitter)

  data.frame(
    source = c("blogs", "news", "twitter"),
    file.size.MB = c(blog_file_Size, news_file_Size, twitter_file_Size),
    num.lines = lengths(corpora),
    num.words = unlist(lapply(word_counts, sum)),
    mean.num.words = unlist(lapply(word_counts, mean))
  )
})
print(dataset_view)
## source file.size.MB num.lines num.words mean.num.words
## 1 blogs 200.4242 899288 37546246 41.75108
## 2 news 196.2775 1010242 34762395 34.40997
## 3 twitter 159.3641 2360148 30093369 12.75063
# Set-up for the corpus / n-gram stage: attach the remaining packages, seed
# the random number generator and restrict processing to one core.
# tm is already attached at the top of the script; this repeated call is a
# harmless no-op.
library(tm)
# Fixed seed so that any later random step gives the same result on each run.
set.seed(679)
# Create corpus and clean the data
library(RWeka)
## Warning: package 'RWeka' was built under R version 3.4.2
# Attaching ggplot2 after NLP masks NLP's annotate(), as the message below
# reports; call NLP::annotate() explicitly if the NLP version is needed.
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
# Set the global "mc.cores" option to 1.
# NOTE(review): presumably this keeps the tm/RWeka steps single-core to avoid
# problems with parallel tokenization -- confirm it is still required.
options(mc.cores=1)