Introduction

The goal of this milestone report is to demonstrate familiarity with the data and show that the project is on track to produce a prediction algorithm. The report is kept concise: it describes the major features identified in the data so far and briefly summarizes the plan for building the prediction algorithm and Shiny app, in terms understandable to a non-data-scientist manager.

Importing the data

Let's import the data from the raw files.

dataURL <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"

dataZip <- "Coursera-SwiftKey.zip"

# Download and unzip the dataset only if it is not already present
if (!file.exists(dataZip)) {
  download.file(dataURL, dataZip, method = "auto")
  unzip(dataZip)
}
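
Before reading anything in, it is worth confirming that the extracted files exist and checking their size. A quick sketch in base R, using the same relative paths as the code below:

en_files <- c("en_US/en_US.blogs.txt", "en_US/en_US.news.txt", "en_US/en_US.twitter.txt")

# Sizes in megabytes (file.size returns NA if a file is missing)
round(file.size(en_files) / 1024^2, 1)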

Analysis Summary

Let's perform basic summaries and analyses to learn more about the raw data.

Basic summary and analysis of the blogs file

library(ngram)
library(stringi)
library(tm)
library(ggplot2)
en_blogs_raw <- readLines("en_US/en_US.blogs.txt", skipNul = TRUE)



# Collapse all lines into one string so ngram::wordcount can be applied
c_en_blogs_raw <- concatenate(en_blogs_raw)

# Word count in the blogs file

wordcount(c_en_blogs_raw)
## [1] 37334131
# Line count in the blogs file

length(en_blogs_raw)
## [1] 899288
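stringi, loaded above, can also produce a compact overall summary in a single call:

# Lines, non-empty lines, characters, and non-whitespace characters
stri_stats_general(en_blogs_raw)
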
# Let's explore a bit more: find the number of words per line

blog_words_per_line<-stri_count_words(en_blogs_raw)



summary(blog_words_per_line)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    9.00   28.00   41.75   60.00 6726.00
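The mean (about 42 words) is well above the median (28), so line lengths are right-skewed. A quick base-graphics sketch of the distribution (the 100-word cutoff is an arbitrary choice to trim the long tail for display):

# Histogram of words per line, truncated at 100 words for readability
hist(blog_words_per_line[blog_words_per_line <= 100],
     breaks = 50, main = "Words per line (blogs)", xlab = "Words per line")
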
# Let's sample 2% of the lines and drop the full data to save memory
# (set.seed is added here so the sample is reproducible; the seed value is arbitrary)
set.seed(1234)
sample_blog <- sample(en_blogs_raw, round(0.02 * length(en_blogs_raw)))


rm(en_blogs_raw)
rm(c_en_blogs_raw)


# Let's create a Corpus and clean the data to visualise the most frequently occurring words in the blog data

blog_corpus <- Corpus(VectorSource(sample_blog))

# Remove all punctuation
blog_corpus <- tm_map(blog_corpus, removePunctuation)

# Remove all numbers
blog_corpus <- tm_map(blog_corpus, removeNumbers)

# Strip extra whitespace
blog_corpus <- tm_map(blog_corpus, stripWhitespace)

# Build a term-document matrix; wordLengths = c(1, Inf) keeps one-letter words
# (it replaces the minWordLength control, which current versions of tm ignore)
blog_dtm <- TermDocumentMatrix(blog_corpus, control = list(wordLengths = c(1, Inf)))
blog_dtm_mat <- as.matrix(blog_dtm)
blog_ordered <- sort(rowSums(blog_dtm_mat), decreasing = TRUE)

rm(blog_dtm_mat)
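
Converting the term-document matrix to a dense matrix, as done above, can exhaust memory on larger samples. A lighter alternative sketch sums term frequencies directly on the sparse representation, using the slam package (a dependency of tm, so it is installed alongside it):

library(slam)

# row_sums operates on the sparse matrix, avoiding the dense as.matrix copy
blog_ordered <- sort(row_sums(blog_dtm), decreasing = TRUE)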

# Plot the ten most frequent words in the blog sample
blog_head <- head(blog_ordered, 10)

barplot(blog_head)
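
Since the eventual goal is next-word prediction, it is also worth glancing at word pairs. A minimal bigram sketch using the ngram package loaded above, run on the 2% blog sample (get.phrasetable returns each phrase with its count and proportion):

# Build bigrams from the blog sample and inspect the most frequent pairs
blog_bigrams <- ngram(concatenate(sample_blog), n = 2)
head(get.phrasetable(blog_bigrams))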

Basic summary and analysis of the news file

en_news_raw <- readLines("en_US/en_US.news.txt", skipNul = TRUE)


c_en_news_raw <- concatenate(en_news_raw)

# Word count in the news file

wordcount(c_en_news_raw)
## [1] 34372530
# Line count in the news file

length(en_news_raw)
## [1] 1010242
# Let's explore a bit more: find the number of words per line

news_words_per_line<-stri_count_words(en_news_raw)

summary(news_words_per_line)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00   19.00   32.00   34.41   46.00 1796.00
# Let's sample 2% of the lines and drop the full data to save memory
set.seed(1234)  # for a reproducible sample
sample_news <- sample(en_news_raw, round(0.02 * length(en_news_raw)))



rm(en_news_raw)
rm(c_en_news_raw)




# Let's create a Corpus and clean the data to visualise the most frequently occurring words in the news data

news_corpus <- Corpus(VectorSource(sample_news))

# Remove all punctuation
news_corpus <- tm_map(news_corpus, removePunctuation)

# Remove all numbers
news_corpus <- tm_map(news_corpus, removeNumbers)

# Strip extra whitespace
news_corpus <- tm_map(news_corpus, stripWhitespace)

# wordLengths replaces the deprecated minWordLength control
news_dtm <- TermDocumentMatrix(news_corpus, control = list(wordLengths = c(1, Inf)))
news_dtm_mat <- as.matrix(news_dtm)
news_ordered <- sort(rowSums(news_dtm_mat), decreasing = TRUE)

rm(news_dtm_mat)

# Plot the ten most frequent words in the news sample
news_head <- head(news_ordered, 10)

barplot(news_head)
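
The top of these raw frequency lists is likely dominated by common function words ("the", "and", and so on). To see more distinctive vocabulary, one option is to drop English stop words before rebuilding the matrix; a sketch using tm's built-in stop word list:

# Remove common English stop words, then rebuild the term-document matrix
news_corpus_ns <- tm_map(news_corpus, removeWords, stopwords("english"))
news_dtm_ns <- TermDocumentMatrix(news_corpus_ns, control = list(wordLengths = c(1, Inf)))
head(sort(rowSums(as.matrix(news_dtm_ns)), decreasing = TRUE), 10)

Note that for the prediction model itself stop words would be kept, since they are among the words most often typed and therefore predicted.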

Basic summary and analysis of the Twitter file

en_tweet_raw <- readLines("en_US/en_US.twitter.txt", skipNul = TRUE)


c_en_twitter_raw <- concatenate(en_tweet_raw)

# Word count in the Twitter file

wordcount(c_en_twitter_raw)
## [1] 30373583
# Line count in the Twitter file

length(en_tweet_raw)
## [1] 2360148
# Let's explore a bit more: find the number of words per line
# (the low maximum below reflects Twitter's 140-character limit)

twitter_words_per_line<-stri_count_words(en_tweet_raw)

summary(twitter_words_per_line)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00    7.00   12.00   12.75   18.00   47.00
# Let's sample 2% of the lines and drop the full data to save memory
set.seed(1234)  # for a reproducible sample
sample_tweet <- sample(en_tweet_raw, round(0.02 * length(en_tweet_raw)))





rm(en_tweet_raw)
rm(c_en_twitter_raw)




# Let's create a Corpus and clean the data to visualise the most frequently occurring words in the Twitter data

tweet_corpus <- Corpus(VectorSource(sample_tweet))

# Remove all punctuation
tweet_corpus <- tm_map(tweet_corpus, removePunctuation)

# Remove all numbers
tweet_corpus <- tm_map(tweet_corpus, removeNumbers)

# Strip extra whitespace
tweet_corpus <- tm_map(tweet_corpus, stripWhitespace)

# wordLengths replaces the deprecated minWordLength control
tweet_dtm <- TermDocumentMatrix(tweet_corpus, control = list(wordLengths = c(1, Inf)))
tweet_dtm_mat <- as.matrix(tweet_dtm)
tweet_ordered <- sort(rowSums(tweet_dtm_mat), decreasing = TRUE)

# Plot the ten most frequent words in the Twitter sample
tweet_head <- head(tweet_ordered, 10)

barplot(tweet_head)

rm(tweet_ordered)
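
As a step toward the prediction algorithm, the three 2% samples can be combined into a single training corpus and saved to disk, so later modelling steps do not need to re-read the raw files. A minimal sketch (the file name is an arbitrary choice):

# Combine the blog, news and Twitter samples into one training set
training_sample <- c(sample_blog, sample_news, sample_tweet)

# Persist the sample so the n-gram model can be built from here
saveRDS(training_sample, "training_sample.rds")

From here, the intended next steps are to build n-gram frequency tables from this sample and use them to predict the next word in the Shiny app.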