The goal of this milestone report is to show that we have become familiar with the data and are on track to build the prediction algorithm. The report is kept concise: it describes the major features of the data identified so far and briefly summarizes the plan for the prediction algorithm and Shiny app in a way that a non-data-scientist manager can follow.
Let's import the data from the raw files.
dataURL<-"https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
dataZip <- "Coursera-SwiftKey.zip"
if (!file.exists(dataZip)) {
  download.file(dataURL, dataZip, method = "auto")
  unzip(dataZip)
}
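As a quick sanity check, we can list the extracted English files and their sizes. This is an illustrative sketch; the path assumes the archive unpacks the English files into an en_US directory, matching the paths used to read the files below (depending on the zip layout they may instead sit under a final/en_US folder).
# List the extracted English files and their sizes in megabytes (path assumed)
en_files <- list.files("en_US", full.names = TRUE)
data.frame(file = basename(en_files),
           size_MB = round(file.info(en_files)$size / 1024^2, 1))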
Let's perform basic summaries and analyses to learn more about the raw data.
library(ngram)
library(stringi)
library(tm)
library(ggplot2)
en_blogs_raw <- readLines("en_US/en_US.blogs.txt", skipNul = TRUE)
c_en_blogs_raw <- concatenate(en_blogs_raw)
# Word Counts in blogs file
wordcount(c_en_blogs_raw)
## [1] 37334131
# Line count in blogs file
length(en_blogs_raw)
## [1] 899288
# Let's explore a bit more: count the number of words per line
blog_words_per_line<-stri_count_words(en_blogs_raw)
summary(blog_words_per_line)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 9.00 28.00 41.75 60.00 6726.00
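The summary shows a heavily skewed distribution (a median of 28 words per line but a maximum of 6726), so a quick histogram helps. This is an illustrative sketch using the ggplot2 package loaded above; the zoomed x-axis range is an arbitrary choice.
# Histogram of words per line in the blogs file, zoomed to 0-200 words
ggplot(data.frame(words = blog_words_per_line), aes(x = words)) +
  geom_histogram(binwidth = 5) +
  coord_cartesian(xlim = c(0, 200)) +
  labs(x = "Words per line", y = "Number of lines",
       title = "Words per line in the blogs file")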
# Let's sample 2% of the lines and remove the rest to save memory
sample_blog <- sample(en_blogs_raw, round(0.02 * length(en_blogs_raw)))
rm(en_blogs_raw)
rm(c_en_blogs_raw)
# Let's create a corpus and clean the data to visualise the most frequently occurring words in the blog data
blog_corpus <- Corpus(VectorSource(sample_blog))
# Remove all non-ASCII characters
blog_corpus <- tm_map(blog_corpus, content_transformer(function(x) iconv(x, "latin1", "ASCII", sub = "")))
# Remove all punctuation
blog_corpus <- tm_map(blog_corpus, removePunctuation)
# Remove all numbers
blog_corpus <- tm_map(blog_corpus, removeNumbers)
# Convert everything to lower case
blog_corpus <- tm_map(blog_corpus, content_transformer(tolower))
# Remove English stopwords
blog_corpus <- tm_map(blog_corpus, removeWords, stopwords("english"))
# Strip extra whitespace
blog_corpus <- tm_map(blog_corpus, stripWhitespace)
blog_dtm <- TermDocumentMatrix(blog_corpus, control = list(wordLengths = c(1, Inf)))
blog_dtm_mat <- as.matrix(blog_dtm)
blog_ordered <- sort(rowSums(blog_dtm_mat), decreasing = TRUE)
rm(blog_dtm_mat)
blog_head <- head(blog_ordered)
barplot(blog_head)
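Since the eventual goal is next-word prediction, it is also worth looking at word pairs. The following is an illustrative sketch using the ngram package loaded above; it runs on the raw 2% blog sample (sample_blog), so case and punctuation differences still count as distinct tokens, and the exact frequencies depend on the random sample.
# Build a bigram frequency table from the sampled blog text
blog_bigrams <- ngram(concatenate(sample_blog), n = 2)
head(get.phrasetable(blog_bigrams), 10)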
en_news_raw <- readLines("en_US/en_US.news.txt", skipNul = TRUE)
c_en_news_raw <- concatenate(en_news_raw)
# Word Counts in news file
wordcount(c_en_news_raw)
## [1] 34372530
# Line count in news file
length(en_news_raw)
## [1] 1010242
# Let's explore a bit more: count the number of words per line
news_words_per_line<-stri_count_words(en_news_raw)
summary(news_words_per_line)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 19.00 32.00 34.41 46.00 1796.00
# Let's sample 2% of the lines and remove the rest to save memory
sample_news <- sample(en_news_raw, round(0.02 * length(en_news_raw)))
rm(en_news_raw)
rm(c_en_news_raw)
# Let's create a corpus and clean the data to visualise the most frequently occurring words in the news data
news_corpus <- Corpus(VectorSource(sample_news))
# Remove all non-ASCII characters
news_corpus <- tm_map(news_corpus, content_transformer(function(x) iconv(x, "latin1", "ASCII", sub = "")))
# Remove all punctuation
news_corpus <- tm_map(news_corpus, removePunctuation)
# Remove all numbers
news_corpus <- tm_map(news_corpus, removeNumbers)
# Convert everything to lower case
news_corpus <- tm_map(news_corpus, content_transformer(tolower))
# Remove English stopwords
news_corpus <- tm_map(news_corpus, removeWords, stopwords("english"))
# Strip extra whitespace
news_corpus <- tm_map(news_corpus, stripWhitespace)
news_dtm <- TermDocumentMatrix(news_corpus, control = list(wordLengths = c(1, Inf)))
news_dtm_mat <- as.matrix(news_dtm)
news_ordered <- sort(rowSums(news_dtm_mat), decreasing = TRUE)
rm(news_dtm_mat)
# Plot the top 10 most frequent terms in the news sample
news_head <- head(news_ordered, 10)
barplot(news_head)
en_tweet_raw <- readLines("en_US/en_US.twitter.txt", skipNul = TRUE)
c_en_twitter_raw <- concatenate(en_tweet_raw)
# Word Counts in twitter file
wordcount(c_en_twitter_raw)
## [1] 30373583
# Line count in twitter file
length(en_tweet_raw)
## [1] 2360148
# Let's explore a bit more: count the number of words per line
twitter_words_per_line<-stri_count_words(en_tweet_raw)
summary(twitter_words_per_line)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 7.00 12.00 12.75 18.00 47.00
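With per-line word counts now available for all three files, a side-by-side view makes the differences obvious: Twitter lines are much shorter than blog or news lines, which will matter when deciding how much context the prediction model can use. This is an illustrative sketch using the three *_words_per_line vectors computed above; the y-axis limit is an arbitrary choice to hide extreme outliers.
# Compare the words-per-line distributions across the three sources
line_lengths <- data.frame(
  source = rep(c("blogs", "news", "twitter"),
               times = c(length(blog_words_per_line),
                         length(news_words_per_line),
                         length(twitter_words_per_line))),
  words = c(blog_words_per_line, news_words_per_line, twitter_words_per_line)
)
ggplot(line_lengths, aes(x = source, y = words)) +
  geom_boxplot(outlier.shape = NA) +
  coord_cartesian(ylim = c(0, 100)) +
  labs(x = NULL, y = "Words per line")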
# Let's sample 2% of the lines and remove the rest to save memory
sample_tweet <- sample(en_tweet_raw, round(0.02 * length(en_tweet_raw)))
rm(en_tweet_raw)
rm(c_en_twitter_raw)
# Let's create a corpus and clean the data to visualise the most frequently occurring words in the twitter data
tweet_corpus <- Corpus(VectorSource(sample_tweet))
# Remove all non-ASCII characters
tweet_corpus <- tm_map(tweet_corpus, content_transformer(function(x) iconv(x, "latin1", "ASCII", sub = "")))
# Remove all punctuation
tweet_corpus <- tm_map(tweet_corpus, removePunctuation)
# Remove all numbers
tweet_corpus <- tm_map(tweet_corpus, removeNumbers)
# Convert everything to lower case
tweet_corpus <- tm_map(tweet_corpus, content_transformer(tolower))
# Remove English stopwords
tweet_corpus <- tm_map(tweet_corpus, removeWords, stopwords("english"))
# Strip extra whitespace
tweet_corpus <- tm_map(tweet_corpus, stripWhitespace)
tweet_dtm <- TermDocumentMatrix(tweet_corpus, control = list(wordLengths = c(1, Inf)))
tweet_dtm_mat <- as.matrix(tweet_dtm)
tweet_ordered <- sort(rowSums(tweet_dtm_mat), decreasing = TRUE)
rm(tweet_dtm_mat)
# Plot the top 10 most frequent terms in the twitter sample
tweet_head <- head(tweet_ordered, 10)
barplot(tweet_head)
rm(tweet_ordered)
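A straightforward plan for the prediction algorithm is to build unigram, bigram, and trigram frequency tables from a cleaned, combined sample of all three sources, look up the most frequent continuation of the typed words (falling back to shorter n-grams when a longer one has not been seen), and wrap that lookup in a Shiny app that suggests the next word as the user types. As a rough illustration only, the hypothetical helper below (not part of the analysis above) looks up the most frequent followers of a word in a bigram phrase table such as the one produced by get.phrasetable() in the earlier sketch.
# Hypothetical helper: given a bigram phrase table, return the k words most
# often seen after `word` (case-sensitive, since the table was built from raw text)
predict_next_word <- function(phrasetable, word, k = 3) {
  pairs <- strsplit(trimws(phrasetable$ngrams), " ")
  first <- sapply(pairs, `[`, 1)
  hits <- phrasetable[first == word, ]
  hits <- hits[order(hits$freq, decreasing = TRUE), ]
  head(sapply(strsplit(trimws(hits$ngrams), " "), `[`, 2), k)
}
# Example (assumes blog_bigrams from the earlier sketch is still in memory):
# predict_next_word(get.phrasetable(blog_bigrams), "thank")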