Introduction

The goal of this milestone report is to demonstrate familiarity with the data and show that the project is on track to produce a prediction algorithm. The report is kept concise: it describes the major features identified in the data so far and briefly summarizes the plan for building the prediction algorithm and Shiny app, in terms understandable to a non-data-scientist manager.

Importing the data

Let's import the data from the raw files.

dataURL <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"

dataZip <- "Coursera-SwiftKey.zip"

# Download and unzip the dataset only if it is not already present
if (!file.exists(dataZip)) {
  download.file(dataURL, dataZip, method = "auto")
  unzip(dataZip)
}
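
Before reading anything in, it is worth confirming that the extracted files exist and checking their size. A quick sketch in base R, using the same relative paths as the code below:

en_files <- c("en_US/en_US.blogs.txt", "en_US/en_US.news.txt", "en_US/en_US.twitter.txt")

# Sizes in megabytes (file.size returns NA if a file is missing)
round(file.size(en_files) / 1024^2, 1)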

Analysis Summary

Let's perform basic summaries and analyses to learn more about the raw data.

Basic summary and analysis of the blogs file

library(ngram)
library(stringi)
library(tm)
library(ggplot2)
en_blogs_raw <- readLines("en_US/en_US.blogs.txt", skipNul = TRUE)



# Collapse all lines into one string so ngram::wordcount can be applied
c_en_blogs_raw <- concatenate(en_blogs_raw)

# Word count in the blogs file

wordcount(c_en_blogs_raw)
## [1] 37334131
# Line count in the blogs file

length(en_blogs_raw)
## [1] 899288
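stringi, loaded above, can also produce a compact overall summary in a single call:

# Lines, non-empty lines, characters, and non-whitespace characters
stri_stats_general(en_blogs_raw)
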
# Let's explore a bit more: find the number of words per line

blog_words_per_line<-stri_count_words(en_blogs_raw)



summary(blog_words_per_line)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    9.00   28.00   41.75   60.00 6726.00
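The mean (about 42 words) is well above the median (28), so line lengths are right-skewed. A quick base-graphics sketch of the distribution (the 100-word cutoff is an arbitrary choice to trim the long tail for display):

# Histogram of words per line, truncated at 100 words for readability
hist(blog_words_per_line[blog_words_per_line <= 100],
     breaks = 50, main = "Words per line (blogs)", xlab = "Words per line")
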
# Let's sample 2% of the lines and drop the full data to save memory
# (set.seed is added here so the sample is reproducible; the seed value is arbitrary)
set.seed(1234)
sample_blog <- sample(en_blogs_raw, round(0.02 * length(en_blogs_raw)))


rm(en_blogs_raw)
rm(c_en_blogs_raw)


# Let's create a Corpus and clean the data to visualise the most frequently occurring words in the blog data

blog_corpus <- Corpus(VectorSource(sample_blog))

# Remove all punctuation
blog_corpus <- tm_map(blog_corpus, removePunctuation)

# Remove all numbers
blog_corpus <- tm_map(blog_corpus, removeNumbers)

# Strip extra whitespace
blog_corpus <- tm_map(blog_corpus, stripWhitespace)

# Build a term-document matrix; wordLengths = c(1, Inf) keeps one-letter words
# (it replaces the minWordLength control, which current versions of tm ignore)
blog_dtm <- TermDocumentMatrix(blog_corpus, control = list(wordLengths = c(1, Inf)))
blog_dtm_mat <- as.matrix(blog_dtm)
blog_ordered <- sort(rowSums(blog_dtm_mat), decreasing = TRUE)

rm(blog_dtm_mat)
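
Converting the term-document matrix to a dense matrix, as done above, can exhaust memory on larger samples. A lighter alternative sketch sums term frequencies directly on the sparse representation, using the slam package (a dependency of tm, so it is installed alongside it):

library(slam)

# row_sums operates on the sparse matrix, avoiding the dense as.matrix copy
blog_ordered <- sort(row_sums(blog_dtm), decreasing = TRUE)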

# Plot the ten most frequent words in the blog sample
blog_head <- head(blog_ordered, 10)

barplot(blog_head)
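
Since the eventual goal is next-word prediction, it is also worth glancing at word pairs. A minimal bigram sketch using the ngram package loaded above, run on the 2% blog sample (get.phrasetable returns each phrase with its count and proportion):

# Build bigrams from the blog sample and inspect the most frequent pairs
blog_bigrams <- ngram(concatenate(sample_blog), n = 2)
head(get.phrasetable(blog_bigrams))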

Basic summary and analysis of the news file

en_news_raw <- readLines("en_US/en_US.news.txt", skipNul = TRUE)


c_en_news_raw <- concatenate(en_news_raw)

# Word count in the news file

wordcount(c_en_news_raw)
## [1] 34372530
# Line count in the news file

length(en_news_raw)
## [1] 1010242
# Let's explore a bit more: find the number of words per line

news_words_per_line<-stri_count_words(en_news_raw)

summary(news_words_per_line)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00   19.00   32.00   34.41   46.00 1796.00
# Let's sample 2% of the lines and drop the full data to save memory
set.seed(1234)  # for a reproducible sample
sample_news <- sample(en_news_raw, round(0.02 * length(en_news_raw)))



rm(en_news_raw)
rm(c_en_news_raw)




# Let's create a Corpus and clean the data to visualise the most frequently occurring words in the news data

news_corpus <- Corpus(VectorSource(sample_news))

# Remove all punctuation
news_corpus <- tm_map(news_corpus, removePunctuation)

# Remove all numbers
news_corpus <- tm_map(news_corpus, removeNumbers)

# Strip extra whitespace
news_corpus <- tm_map(news_corpus, stripWhitespace)

# wordLengths replaces the deprecated minWordLength control
news_dtm <- TermDocumentMatrix(news_corpus, control = list(wordLengths = c(1, Inf)))
news_dtm_mat <- as.matrix(news_dtm)
news_ordered <- sort(rowSums(news_dtm_mat), decreasing = TRUE)

rm(news_dtm_mat)

# Plot the ten most frequent words in the news sample
news_head <- head(news_ordered, 10)

barplot(news_head)
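
The top of these raw frequency lists is likely dominated by common function words ("the", "and", and so on). To see more distinctive vocabulary, one option is to drop English stop words before rebuilding the matrix; a sketch using tm's built-in stop word list:

# Remove common English stop words, then rebuild the term-document matrix
news_corpus_ns <- tm_map(news_corpus, removeWords, stopwords("english"))
news_dtm_ns <- TermDocumentMatrix(news_corpus_ns, control = list(wordLengths = c(1, Inf)))
head(sort(rowSums(as.matrix(news_dtm_ns)), decreasing = TRUE), 10)

Note that for the prediction model itself stop words would be kept, since they are among the words most often typed and therefore predicted.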

Basic summary and analysis of the Twitter file

en_tweet_raw <- readLines("en_US/en_US.twitter.txt", skipNul = TRUE)


c_en_twitter_raw <- concatenate(en_tweet_raw)

# Word count in the Twitter file

wordcount(c_en_twitter_raw)
## [1] 30373583
# Line count in the Twitter file

length(en_tweet_raw)
## [1] 2360148
# Let's explore a bit more: find the number of words per line
# (the low maximum below reflects Twitter's 140-character limit)

twitter_words_per_line<-stri_count_words(en_tweet_raw)

summary(twitter_words_per_line)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00    7.00   12.00   12.75   18.00   47.00
# Let's sample 2% of the lines and drop the full data to save memory
set.seed(1234)  # for a reproducible sample
sample_tweet <- sample(en_tweet_raw, round(0.02 * length(en_tweet_raw)))





rm(en_tweet_raw)
rm(c_en_twitter_raw)




# Let's create a Corpus and clean the data to visualise the most frequently occurring words in the Twitter data

tweet_corpus <- Corpus(VectorSource(sample_tweet))

# Remove all punctuation
tweet_corpus <- tm_map(tweet_corpus, removePunctuation)

# Remove all numbers
tweet_corpus <- tm_map(tweet_corpus, removeNumbers)

# Strip extra whitespace
tweet_corpus <- tm_map(tweet_corpus, stripWhitespace)

# wordLengths replaces the deprecated minWordLength control
tweet_dtm <- TermDocumentMatrix(tweet_corpus, control = list(wordLengths = c(1, Inf)))
tweet_dtm_mat <- as.matrix(tweet_dtm)
tweet_ordered <- sort(rowSums(tweet_dtm_mat), decreasing = TRUE)

# Plot the ten most frequent words in the Twitter sample
tweet_head <- head(tweet_ordered, 10)

barplot(tweet_head)

rm(tweet_ordered)
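
As a step toward the prediction algorithm, the three 2% samples can be combined into a single training corpus and saved to disk, so later modelling steps do not need to re-read the raw files. A minimal sketch (the file name is an arbitrary choice):

# Combine the blog, news and Twitter samples into one training set
training_sample <- c(sample_blog, sample_news, sample_tweet)

# Persist the sample so the n-gram model can be built from here
saveRDS(training_sample, "training_sample.rds")

From here, the intended next steps are to build n-gram frequency tables from this sample and use them to predict the next word in the Shiny app.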