Milestone Report Capstone
Hamid Junejo
08/07/2017
## Libraries and download data

# Required libraries
library(tm)
library(ggplot2)
library(stringi)
library(NLP)
library(magrittr)
library(SnowballC)

# Download data
setwd("C:/Users/aarp/Downloads")
getwd()
## [1] "C:/Users/aarp/Downloads"
fileUrl <-"https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip" if (!file.exists("Coursera-SwiftKey.zip")){ download.file(fileUrl, destfile = "Coursera-SwiftKey.zip", method="curl") unzip("Coursera-SwiftKey.zip") } #We will check the file size of the data filesz1 <- file.info("final/en_US/en_US.blogs.txt")$size / 1024^2 filesz2 <- file.info("final/en_US/en_US.news.txt")$size / 1024^2 filesz3 <- file.info("final/en_US/en_US.twitter.txt")$size / 1024^2 fileSIZE <- rbind(filesz1, filesz2, filesz3) #Load the text files found in folder blogDirectory <- file("C:/Users/aarp/Downloads/Coursera-SwiftKey/final/en_US/en_US.blogs.txt", "rb") blogs <- readLines(blogDirectory, encoding = "UTF-8",skipNul = TRUE) close(blogDirectory) newsDirectory <- file("C:/Users/aarp/Downloads/Coursera-SwiftKey/final/en_US/en_US.news.txt", "rb") news <- readLines(newsDirectory, encoding = "UTF-8",skipNul = TRUE) close(newsDirectory) twitterDirectory <- file("C:/Users/aarp/Downloads/Coursera-SwiftKey/final/en_US/en_US.twitter.txt", "rb") twitter <- readLines(twitterDirectory, encoding = "UTF-8",skipNul = TRUE) close(twitterDirectory) ##Exploratory Data analysis #Get the number of lines of each source of data length(blogs)
## [1] 899288
length(twitter)
## [1] 2360148
length(news)
## [1] 1010242
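The file sizes computed above (fileSIZE, in MB) are never actually printed. A small, optional sketch that labels them and shows them next to the line counts (the data-frame name size_mb is only illustrative):

```r
# Sketch: print the file sizes (MB) computed earlier alongside the line counts
size_mb <- data.frame(
  source  = c("blogs", "news", "twitter"),
  size_MB = round(c(filesz1, filesz2, filesz3), 1),
  lines   = c(length(blogs), length(news), length(twitter))
)
print(size_mb)
```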
# Get the number of words per line for each source and summarize
nwords.blogs   <- stri_count_words(blogs)
nwords.twitter <- stri_count_words(twitter)
nwords.news    <- stri_count_words(news)

### Data Summary

summary(nwords.blogs)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    9.00   28.00   41.75   60.00 6726.00
summary(nwords.twitter)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00    7.00   12.00   12.75   18.00   47.00
summary(nwords.news)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00   19.00   32.00   34.41   46.00 1796.00
wordcount <- rbind(sum(nwords.blogs), sum(nwords.twitter), sum(nwords.news))
print(wordcount)
##          [,1]
## [1,] 37546246
## [2,] 30093410
## [3,] 34762395
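For easier comparison, the per-source figures computed so far could be gathered into one table. A brief sketch, built only from objects already defined above (corpus_stats is an illustrative name):

```r
# Sketch: combine per-source statistics into a single table
corpus_stats <- data.frame(
  source              = c("blogs", "twitter", "news"),
  lines               = c(length(blogs), length(twitter), length(news)),
  words               = c(sum(nwords.blogs), sum(nwords.twitter), sum(nwords.news)),
  mean_words_per_line = c(mean(nwords.blogs), mean(nwords.twitter), mean(nwords.news))
)
print(corpus_stats)
```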
### Exploration of a sample of the data

# Prepare a random sample of 0.05% of the lines provided by each source
set.seed(10000)
s_blogs <- sample(blogs, length(blogs) * 0.0005)
set.seed(10000)
s_news <- sample(news, length(news) * 0.0005)
set.seed(10000)
s_twitter <- sample(twitter, length(twitter) * 0.0005)

snwords.blogs   <- stri_count_words(s_blogs)
snwords.news    <- stri_count_words(s_news)
snwords.twitter <- stri_count_words(s_twitter)

df.nwords.all <- data.frame(
  nword = c(snwords.blogs, snwords.twitter, snwords.news),
  type  = c(rep("blog", length(snwords.blogs)),
            rep("twitter", length(snwords.twitter)),
            rep("news", length(snwords.news)))
)

# Plot the density of the number of words per line for each source
ggplot(data = df.nwords.all) +
  geom_density(aes(nword)) +
  facet_wrap(~type, nrow = 3) +
  xlim(0, 500)
## Warning: Removed 1 rows containing non-finite values (stat_density).
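Since the conclusions below call for better sampling, one optional follow-up (not part of the analysis above) is to pool the three samples and save them, so that later modelling steps reuse exactly the same data; the object and file names here are only examples:

```r
# Sketch: pool the three samples and write them to disk for reuse in later steps
# (the file name "sample_en_US.txt" is illustrative)
s_all <- c(s_blogs, s_news, s_twitter)
writeLines(s_all, "sample_en_US.txt")
```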

# Create a corpus and clean it to see which words occur most often (applied to the news sample only)
newsCorpus <- Corpus(VectorSource(s_news))
newsCorpus <- tm_map(newsCorpus, content_transformer(tolower))
newsCorpus <- tm_map(newsCorpus, removePunctuation)
newsCorpus <- tm_map(newsCorpus, removeNumbers)

newsDTM <- TermDocumentMatrix(newsCorpus, control = list(minWordLength = 1))
mnews <- as.matrix(newsDTM)
newsOrder <- sort(rowSums(mnews), decreasing = TRUE)

# Display the 10 most frequent and the 10 least frequent words
head(newsOrder, 10)
##  the  and  for that said  was with  are  his  but 
##  952  448  179  168  119  110  104   86   78   66
tail(newsOrder, 10)
##     casa chandler   grande occurred     peak  picacho 
##        1        1        1        1        1        1 
##  typical  weren't    year" "because 
##        1        1        1        1
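The most frequent terms above are almost all English stop words, which lines up with the "more data cleansing" point in the conclusions. A sketch of how the same news corpus could be re-counted after removing them with tm's built-in stop word list (this step was not applied in the analysis above; newsCorpusClean and the related names are illustrative):

```r
# Sketch: remove English stop words and extra whitespace, then recount term frequencies
newsCorpusClean <- tm_map(newsCorpus, removeWords, stopwords("english"))
newsCorpusClean <- tm_map(newsCorpusClean, stripWhitespace)

newsDTMclean <- TermDocumentMatrix(newsCorpusClean, control = list(minWordLength = 1))
cleanOrder   <- sort(rowSums(as.matrix(newsDTMclean)), decreasing = TRUE)
head(cleanOrder, 10)   # content words should now dominate the top of the list
```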
## Conclusions
I have performed an exploratory analysis of the dataset. Using text mining on a random sample of the data, I identified the words that occur with the highest frequency in each source.
After looking at the data, the next steps are:

1 - Perform more data cleansing and draw a better sample.
2 - Create a prediction model and build an application (a minimal n-gram sketch follows below).
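As a first, rough step toward the prediction model, the sketch below counts bigrams in the news sample using only base R; s_news comes from the sampling step above, while every other name (txt, tokens, bigram_freq) is purely illustrative:

```r
# Sketch: simple bigram counts from the news sample, as a starting point for prediction
txt    <- tolower(s_news)
txt    <- gsub("[^a-z' ]", " ", txt)          # keep letters, apostrophes and spaces
tokens <- strsplit(txt, "\\s+")

bigrams <- unlist(lapply(tokens, function(w) {
  w <- w[nzchar(w)]                           # drop empty tokens
  if (length(w) < 2) return(character(0))
  paste(head(w, -1), tail(w, -1))             # pair each word with its successor
}))

bigram_freq <- sort(table(bigrams), decreasing = TRUE)
head(bigram_freq, 10)                         # most frequent word pairs
```

A next-word predictor would extend this idea to trigrams and back off to shorter n-grams when a given context has not been seen.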