Milestone Report Capstone

Hamid Junejo

08/07/2017

##Libraries and Download Data
#Required libraries
library(tm)
## Loading required package: NLP
library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate
library(stringi)
library(magrittr)
library(SnowballC)

#Download data
setwd("C:/Users/aarp/Downloads")
getwd()
## [1] "C:/Users/aarp/Downloads"
fileUrl <-"https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"

if (!file.exists("Coursera-SwiftKey.zip")) {
    download.file(fileUrl, destfile = "Coursera-SwiftKey.zip", method = "curl")
    unzip("Coursera-SwiftKey.zip")
}
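
A quick check (added here, not in the original script) confirms the archive extracted where the later paths expect it; this assumes the zip unpacks into a top-level "final" folder under the working directory.

#List the English files to confirm the extraction succeeded
list.files("final/en_US")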

#Check the size of each data file, in megabytes
filesz1 <- file.info("final/en_US/en_US.blogs.txt")$size / 1024^2
filesz2 <- file.info("final/en_US/en_US.news.txt")$size / 1024^2
filesz3 <- file.info("final/en_US/en_US.twitter.txt")$size / 1024^2
fileSIZE <- rbind(filesz1, filesz2, filesz3)
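
The sizes above are computed but never shown; the small addition below labels the matrix and prints the values in megabytes.

#Label the rows and print the file sizes in MB
rownames(fileSIZE) <- c("blogs", "news", "twitter")
colnames(fileSIZE) <- "size_MB"
round(fileSIZE, 1)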

#Load the three English text files from the extracted "final/en_US" folder
blogsCon <- file("final/en_US/en_US.blogs.txt", "rb")
blogs <- readLines(blogsCon, encoding = "UTF-8", skipNul = TRUE)
close(blogsCon)

newsCon <- file("final/en_US/en_US.news.txt", "rb")
news <- readLines(newsCon, encoding = "UTF-8", skipNul = TRUE)
close(newsCon)

twitterCon <- file("final/en_US/en_US.twitter.txt", "rb")
twitter <- readLines(twitterCon, encoding = "UTF-8", skipNul = TRUE)
close(twitterCon)
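
The three blocks above repeat the same open/read/close pattern. As an optional refactor (the helper name read_text_file is my own, not from the original), the same files could be read like this:

#Hypothetical helper wrapping the open/read/close pattern used above
read_text_file <- function(path) {
    con <- file(path, "rb")
    on.exit(close(con))
    readLines(con, encoding = "UTF-8", skipNul = TRUE)
}

#Equivalent to the three blocks above
blogs   <- read_text_file("final/en_US/en_US.blogs.txt")
news    <- read_text_file("final/en_US/en_US.news.txt")
twitter <- read_text_file("final/en_US/en_US.twitter.txt")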

##Exploratory Data Analysis
#Get the number of lines in each data source

length(blogs)
## [1] 899288
length(twitter)
## [1] 2360148
length(news)
## [1] 1010242
#Count the number of words per line in each source

nwords.blogs <- stri_count_words(blogs)
nwords.twitter <- stri_count_words(twitter)
nwords.news <- stri_count_words(news)

###Data Summary
summary(nwords.blogs)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    9.00   28.00   41.75   60.00 6726.00
summary(nwords.twitter)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00    7.00   12.00   12.75   18.00   47.00
summary(nwords.news)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00   19.00   32.00   34.41   46.00 1796.00
wordcount <- rbind(sum(nwords.blogs), sum(nwords.twitter), sum(nwords.news))

print(wordcount)
##          [,1]
## [1,] 37546246
## [2,] 30093410
## [3,] 34762395
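To view the basic statistics side by side, the quantities computed so far can be combined into one overview table (this table is an addition, not part of the original analysis):

#Combine file size, line count and word count per source
overview <- data.frame(
    source  = c("blogs", "twitter", "news"),
    size_MB = round(c(filesz1, filesz3, filesz2), 1),
    lines   = c(length(blogs), length(twitter), length(news)),
    words   = c(sum(nwords.blogs), sum(nwords.twitter), sum(nwords.news))
)
overview
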
###Exploration of a Data Sample
#Take a random sample of 0.05% of the lines from each source

set.seed(10000)
s_blogs <- sample(blogs, length(blogs)*0.0005)
set.seed(10000)
s_news <- sample(news, length(news)*0.0005)
set.seed(10000)
s_twitter <- sample(twitter, length(twitter)*0.0005)
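
For later stages it can be convenient to keep the combined sample on disk so the full source files do not have to be reloaded; a minimal sketch (the file name sample_en_US.rds is my own choice):

#Optionally persist the combined sample for reuse in the modelling stage
s_all <- c(s_blogs, s_news, s_twitter)
saveRDS(s_all, "sample_en_US.rds")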

snwords.blogs <- stri_count_words(s_blogs)
snwords.news <- stri_count_words(s_news)
snwords.twitter <- stri_count_words(s_twitter)

df.nwords.all <- data.frame(
    nword = c(snwords.blogs, snwords.twitter, snwords.news),
    type = c(rep("blog", length(snwords.blogs)),
             rep("twitter", length(snwords.twitter)),
             rep("news", length(snwords.news)))
)

#Plot the probability density of the number of words per line for each source

ggplot(data = df.nwords.all) + geom_density(aes(nword)) + facet_wrap(~type, nrow = 3) + xlim(0,500)
## Warning: Removed 1 rows containing non-finite values (stat_density).
(Figure: density of the number of words per line, one panel per source: blog, news, twitter)
#Create a corpus and clean it to see which words occur most often; this is applied to the news sample only

newsCorpus <- Corpus(VectorSource(s_news))
newsCorpus <- tm_map(newsCorpus, content_transformer(tolower))
newsCorpus <- tm_map(newsCorpus, removePunctuation)
newsCorpus <- tm_map(newsCorpus, removeNumbers)
newsDTM <- TermDocumentMatrix(newsCorpus,
                              control = list(wordLengths = c(1, Inf)))

mnews <- as.matrix(newsDTM)
newsOrder <- sort(rowSums(mnews), decreasing = TRUE)

#Display the ten most frequent and ten least frequent words

head(newsOrder, 10)
##  the  and  for that said  was with  are  his  but 
##  952  448  179  168  119  110  104   86   78   66
tail(newsOrder, 10)
##       casa   chandler     grande   occurred       peak    picacho 
##          1          1          1          1          1          1 
##    typical    weren’t      year”   “because 
##          1          1          1          1
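
Common stopwords such as "the" and "and" dominate the top of this list. As a possible extra cleaning pass (not performed in the run above), the English stopwords could be removed and the ten most frequent remaining terms plotted with ggplot2:

#Remove English stopwords and extra whitespace, then rebuild the matrix
newsCorpusClean <- tm_map(newsCorpus, removeWords, stopwords("english"))
newsCorpusClean <- tm_map(newsCorpusClean, stripWhitespace)
newsDTMClean <- TermDocumentMatrix(newsCorpusClean,
                                   control = list(wordLengths = c(1, Inf)))
newsOrderClean <- sort(rowSums(as.matrix(newsDTMClean)), decreasing = TRUE)

#Bar chart of the ten most frequent non-stopword terms
topTerms <- data.frame(term = names(head(newsOrderClean, 10)),
                       freq = head(newsOrderClean, 10))
ggplot(topTerms, aes(x = reorder(term, freq), y = freq)) +
    geom_bar(stat = "identity") +
    coord_flip() +
    labs(x = "Term", y = "Frequency")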

##Conclusions

I have performed an exploratory analysis of the three English data sources. By sampling the data and mining the text, I identified the words that occur most frequently in the news sample.

After this first look at the data, the next steps are:

1 - More thorough data cleansing and a better sampling strategy.

2 - Create a prediction model (a first bigram-counting sketch follows below) and build an application around it.
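
As a first step toward the prediction model mentioned in point 2, word pairs (bigrams) can be counted from the combined sample. The sketch below uses only base R and is an illustration of the idea, not the final modelling approach; note that this simple version also pairs the last word of one line with the first word of the next.

#Count bigrams in the combined sample (illustrative only)
tokens <- unlist(strsplit(tolower(c(s_blogs, s_news, s_twitter)), "[^a-z']+"))
tokens <- tokens[tokens != ""]
bigrams <- paste(head(tokens, -1), tail(tokens, -1))
head(sort(table(bigrams), decreasing = TRUE), 10)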