Introduction

The goal of this milestone report is to build models for text prediction using Natural Language Processing techniques. A large text corpus will be used as the training data for this analysis.

# First, load the required packages.
library(tm)
## Warning: package 'tm' was built under R version 3.4.2
## Loading required package: NLP
# Create a directory and download the required dataset

if(!file.exists("./datasets")){dir.create("./datasets")}

fileUrl <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip" 

if(!file.exists("./datasets/Coursera-SwiftKey.zip")){
   download.file(fileUrl,destfile="./datasets/Coursera-SwiftKey.zip",mode = "wb")
}

# Unzip the files

if(!file.exists("./final")){
    unzip(zipfile="./datasets/Coursera-SwiftKey.zip",exdir=".")
}

After downloading and unzipping the dataset, check that the files are in place.

# List the contents of the en_US folder
list.files("./final/en_US")
## [1] "en_US.blogs.txt"   "en_US.news.txt"    "en_US.twitter.txt"

The folder contains three text files: “en_US.blogs.txt”, “en_US.news.txt”, and “en_US.twitter.txt”. To get to know the type of content in each file, we read them in line by line.

blogs <- readLines("./final/en_US/en_US.blogs.txt", encoding="UTF-8")

twitter <- readLines("./final/en_US/en_US.twitter.txt", encoding="UTF-8")
## Warning in readLines("./final/en_US/en_US.twitter.txt", encoding =
## "UTF-8"): line 167155 appears to contain an embedded nul
## Warning in readLines("./final/en_US/en_US.twitter.txt", encoding =
## "UTF-8"): line 268547 appears to contain an embedded nul
## Warning in readLines("./final/en_US/en_US.twitter.txt", encoding =
## "UTF-8"): line 1274086 appears to contain an embedded nul
## Warning in readLines("./final/en_US/en_US.twitter.txt", encoding =
## "UTF-8"): line 1759032 appears to contain an embedded nul
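The warnings above flag embedded nul characters in the Twitter file; the read still completes. If preferred, the warnings can be avoided with the skipNul argument of readLines (a minimal sketch, reading the same file again):

# Optional: skip embedded nul characters while reading instead of emitting warnings
twitter <- readLines("./final/en_US/en_US.twitter.txt", encoding = "UTF-8", skipNul = TRUE)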
# Read the news file through a binary connection so readLines is not cut short
# by an embedded control character
con <- file("./final/en_US/en_US.news.txt", open = "rb")
news <- readLines(con, encoding="UTF-8")
close(con)
rm(con)

# Now we check the size of the three files in megabytes
blog_file_Size <- file.info("./final/en_US/en_US.blogs.txt")$size /1024^2

news_file_Size <- file.info("./final/en_US/en_US.news.txt")$size /1024^2

twitter_file_Size <- file.info("./final/en_US/en_US.twitter.txt")$size /1024^2


library(stringi)
stri_stats_general(blogs)
##       Lines LinesNEmpty       Chars CharsNWhite 
##      899288      899288   206824382   170389539
stri_stats_general(news)
##       Lines LinesNEmpty       Chars CharsNWhite 
##     1010242     1010242   203223154   169860866
stri_stats_general(twitter)
##       Lines LinesNEmpty       Chars CharsNWhite 
##     2360148     2360148   162096031   134082634
# Now the word counts

words_blogs <- stri_count_words(blogs)
words_twitter <- stri_count_words(twitter)
words_news <- stri_count_words(news)


# Now the summary of our findings

dataset_view <- data.frame(source = c("blogs", "news", "twitter"),
                           file.size.MB = c(blog_file_Size, news_file_Size, twitter_file_Size),
                           num.lines = c(length(blogs), length(news), length(twitter)),
                           num.words = c(sum(words_blogs), sum(words_news), sum(words_twitter)),
                           mean.num.words = c(mean(words_blogs), mean(words_news), mean(words_twitter)))

print(dataset_view)
##    source file.size.MB num.lines num.words mean.num.words
## 1   blogs     200.4242    899288  37546246       41.75108
## 2    news     196.2775   1010242  34762395       34.40997
## 3 twitter     159.3641   2360148  30093369       12.75063

The table above summarises the content of the three files. Next, we can add some plots to get a better view of what the data looks like.

library(tm)
set.seed(679)

# Create corpus and clean the data

library(RWeka)
## Warning: package 'RWeka' was built under R version 3.4.2
library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate
# Restrict tm operations to a single core (a common workaround when using RWeka tokenizers)
options(mc.cores=1)
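To close this milestone, the sketch below outlines the exploratory step the loaded packages are meant for: take a small random sample of each source, build and clean a tm corpus, tokenize it into bigrams with RWeka, and plot the most frequent bigrams with ggplot2. The sample size (1% of lines), the object names, and the choice of bigrams are illustrative assumptions rather than final modelling decisions.

# Take a small random sample of each source so the corpus fits comfortably in memory
sample_text <- c(sample(blogs,   round(length(blogs)   * 0.01)),
                 sample(news,    round(length(news)    * 0.01)),
                 sample(twitter, round(length(twitter) * 0.01)))

# Build a corpus and apply basic cleaning steps
corpus <- VCorpus(VectorSource(sample_text))
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, stripWhitespace)

# Tokenize the cleaned corpus into bigrams with RWeka
BigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
tdm_bigram <- TermDocumentMatrix(corpus, control = list(tokenize = BigramTokenizer))

# Count bigram frequencies and keep the 20 most common ones
bigram_freq <- sort(slam::row_sums(tdm_bigram), decreasing = TRUE)
top_bigrams <- data.frame(bigram = names(bigram_freq)[1:20],
                          freq = bigram_freq[1:20])

# Plot the most frequent bigrams
ggplot(top_bigrams, aes(x = reorder(bigram, freq), y = freq)) +
    geom_bar(stat = "identity") +
    coord_flip() +
    labs(x = "Bigram", y = "Frequency", title = "Top 20 bigrams in the sample")

The same pattern can be repeated with min = 1, max = 1 (unigrams) or min = 3, max = 3 (trigrams) to explore the n-gram frequencies that the prediction model will be built on.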