Milestone Report Capstone

##Libraries and download data
#Required libraries
library(tm)

## Loading required package: NLP

library(ggplot2)

## 
## Attaching package: 'ggplot2'

## The following object is masked from 'package:NLP':
## 
##     annotate

library(stringi)
library(NLP)
library(tm)
library(magrittr)
library(SnowballC)

#Download data
setwd("C:/Users/aarp/Downloads")
getwd()

## [1] "C:/Users/aarp/Downloads"

fileUrl <-"https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"

if (!file.exists("Coursera-SwiftKey.zip")){
     download.file(fileUrl, destfile = "Coursera-SwiftKey.zip", method="curl")
     unzip("Coursera-SwiftKey.zip")
 }

#We will check the file size of the data
filesz1 <- file.info("final/en_US/en_US.blogs.txt")$size / 1024^2
filesz2 <- file.info("final/en_US/en_US.news.txt")$size / 1024^2
filesz3 <- file.info("final/en_US/en_US.twitter.txt")$size / 1024^2
fileSIZE <- rbind(filesz1, filesz2, filesz3)

#Load the text files found in folder
blogDirectory <- file("C:/Users/aarp/Downloads/Coursera-SwiftKey/final/en_US/en_US.blogs.txt", "rb")
blogs <- readLines(blogDirectory, encoding = "UTF-8",skipNul = TRUE)
close(blogDirectory)

newsDirectory <- file("C:/Users/aarp/Downloads/Coursera-SwiftKey/final/en_US/en_US.news.txt", "rb")
news <- readLines(newsDirectory, encoding = "UTF-8",skipNul = TRUE)
close(newsDirectory)

twitterDirectory <- file("C:/Users/aarp/Downloads/Coursera-SwiftKey/final/en_US/en_US.twitter.txt", "rb")
twitter <- readLines(twitterDirectory, encoding = "UTF-8",skipNul = TRUE)
close(twitterDirectory)

##Exploratory Data analysis
#Get the number of lines of each source of data

length(blogs)

## [1] 899288

length(twitter)

## [1] 2360148

length(news)

## [1] 1010242

#Get the number of words per line of data and present the summary

# Get number of words per line

str1 <- "blogs"
str1 <- "twitter"
str1 <- "news"

nwords.blogs <- stri_count_words(blogs)
nwords.twitter <- stri_count_words(twitter)
nwords.news <- stri_count_words(news)

###Data Summary
summary(nwords.blogs)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    9.00   28.00   41.75   60.00 6726.00

summary(nwords.twitter)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00    7.00   12.00   12.75   18.00   47.00

summary(nwords.news)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00   19.00   32.00   34.41   46.00 1796.00

wordcount <- rbind(sum(nwords.blogs), sum(nwords.twitter), sum(nwords.news))

print(wordcount)

##          [,1]
## [1,] 37546246
## [2,] 30093410
## [3,] 34762395

###Exploration of a sample data
#I prepare a random sample of 0.05% of total lines provided by the source data

set.seed(10000)
s_blogs <- sample(blogs, length(blogs)*0.0005)
set.seed(10000)
s_news <- sample(news, length(news)*0.0005)
set.seed(10000)
s_twitter <- sample(twitter, length(twitter)*0.0005)

snwords.blogs <- stri_count_words(s_blogs)
snwords.news <- stri_count_words(s_news)
snwords.twitter <- stri_count_words(s_twitter)

df.nwords.all <- data.frame(nword = c(snwords.blogs, snwords.twitter, snwords.news),                             type = c(rep("blog", length(snwords.blogs)), rep("twitter",length(snwords.twitter)), rep("news", length(snwords.news)))
 )

#Plotting the density probability of the frequency of the number of words per line.

ggplot(data = df.nwords.all) + geom_density(aes(nword)) + facet_wrap(~type, nrow = 3) + xlim(0,500)

## Warning: Removed 1 rows containing non-finite values (stat_density).

#Create a corpus and clean it to see which words occur more often. I apply only for news source

newsCorpus = Corpus(VectorSource(s_news))
newsCorpus = tm_map(newsCorpus, content_transformer(tolower))
newsCorpus = tm_map(newsCorpus, removePunctuation)
newsCorpus = tm_map(newsCorpus, removeNumbers)
newsDTM = TermDocumentMatrix(newsCorpus,
                              control = list(minWordLength = 1))

mnews = as.matrix(newsDTM)
newsOrder <- sort(rowSums(mnews), decreasing = TRUE)

#Displaying which are the 10-top frequent words & 10-bottom words

head(newsOrder, 10)

##  the  and  for that said  was with  are  his  but 
##  952  448  179  168  119  110  104   86   78   66

tail(newsOrder, 10)

##       casa   chandler     grande   occurred       peak    picacho 
##          1          1          1          1          1          1 
##    typical  weren�t year�� �because 
##          1          1          1          1

I have performed an exploratory analysis. From the text mining for the dataset, we sampling the dataset to get the high frequency of occurence of words