Instructions

The goal of this milestone report is to build a first simple model for the relationship between words, which is the first step toward a predictive text-mining application. We start with simple models and work toward more complicated modeling techniques. The purpose of the report is simply to show that we have become comfortable working with the data and are on track to create the prediction algorithm. It is published on RPubs (http://rpubs.com/) and explains the exploratory analysis and the goals for the eventual app and algorithm.

The motivation for this project is to:

  1. Demonstrate that you’ve downloaded the data and have successfully loaded it in.
  2. Create a basic report of summary statistics about the data sets.
  3. Report any interesting findings that you have amassed so far.
  4. Get feedback on your plans for creating a prediction algorithm and Shiny app.

1.1 Download the data file from the web

setwd("~/Downloads")

fileUrl<-"https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
download.file(fileUrl,destfile = "~/Downloads/Coursera-SwiftKey.zip",method="curl")
unzip(Coursera-SwiftKey.zip)
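Because the archive is large, an optional guard (a sketch using the same relative paths as above) avoids repeating the download and extraction every time the report is knitted:

# Download and extract only if the archive is not already present.
if (!file.exists("Coursera-SwiftKey.zip")) {
  download.file(fileUrl, destfile = "Coursera-SwiftKey.zip", method = "curl")
  unzip("Coursera-SwiftKey.zip")
}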

1.2 Read the English blog, news, and Twitter text files into R:

setwd("~/Downloads/final/en_US")

# Blog
en_blog <- file("en_US.blogs.txt", "r")
Blog <- readLines(en_blog)
close(en_blog)

# News
en_news <- file("en_US.news.txt", "r")
News <- readLines(en_news)
close(en_news)

# Twitter
en_twitter <- file("en_US.twitter.txt", "r")
Twitter <- readLines(en_twitter, skipNul = TRUE)  # skip embedded nul characters
close(en_twitter)
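On some systems readLines() warns or stops early on en_US.news.txt because of an embedded control character. A common workaround (assuming that is the cause; not verified here) is to reopen the file in binary mode:

# Re-read the news file in binary mode so an embedded SUB/EOF byte
# does not truncate the input.
en_news <- file("en_US.news.txt", "rb")
News <- readLines(en_news, encoding = "UTF-8", skipNul = TRUE)
close(en_news)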

2.1 Basic summaries of the three files: Blog, News, and Twitter.

library(stringi)  # line, character, and word statistics for the files
Summary <- data.frame(FilesName = c("Blog", "News", "Twitter"))
stri_stats_general(Blog)
##       Lines LinesNEmpty       Chars CharsNWhite 
##      899288      899288   206824382   170389539
stri_stats_general(News)
##       Lines LinesNEmpty       Chars CharsNWhite 
##     1010242     1010242   203223154   169860866
stri_stats_general(Twitter)
##       Lines LinesNEmpty       Chars CharsNWhite 
##     2360148     2360148   162096241   134082806
# Count the words in each line of the three files.
Blogs.words <- stri_count_words(Blog)
News.words <- stri_count_words(News)
Twitter.words <- stri_count_words(Twitter)

2.2 Summary table of the three datasets.

# Pull the line and character counts from stri_stats_general() rather
# than copying the printed values by hand.
stats <- sapply(list(Blog, News, Twitter), stri_stats_general)
Summary$Lines <- stats["Lines", ]
Summary$Chars <- stats["Chars", ]
Summary$Words <- c(sum(Blogs.words), sum(News.words), sum(Twitter.words))
Summary$Words_Average <- c(mean(Blogs.words), mean(News.words), mean(Twitter.words))
Summary
##   FilesName   Lines     Chars    Words Words_Average
## 1      Blog  899288 206824382 37546246      41.75108
## 2      News 1010242 203223154 34762395      34.40997
## 3   Twitter 2360148 162096241 30093410      12.75065

2.3 Output Visualization

library(ggplot2)
library(gridExtra)
g1 <- ggplot(Summary, aes(FilesName, Lines, fill = FilesName)) +
  geom_bar(stat = "identity") +
  labs(x = "Type", y = "", title = "Lines") +
  theme(legend.position = "none")

g2 <- ggplot(Summary, aes(FilesName, Words, fill = FilesName)) +
  geom_bar(stat = "identity") +
  labs(x = "Type", y = "", title = "Words") +
  theme(legend.position = "none")

g3 <- ggplot(Summary, aes(FilesName, Words_Average, fill = FilesName)) +
  geom_bar(stat = "identity") +
  labs(x = "Type", y = "", title = "Words_Average") +
  theme(legend.position = "none")

grid.arrange(g1, g2, g3, ncol = 3)

3. Sampling

To reiterate: to build the models we do not need to load in and use all of the data. A relatively small set of randomly selected lines is usually enough to approximate the results that would be obtained from the full corpus.

set.seed(1)  # fix the random seed so the sample is reproducible
samplesize <- 10000
sample_Blog <- sample(Blog, samplesize)
sample_News <- sample(News, samplesize)
sample_Twitter <- sample(Twitter, samplesize)
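An alternative sketch (the helper name sample_lines() and the 1% rate are assumptions, not part of the code above): instead of drawing a fixed-size sample, each line can be kept independently with a small probability, so the sample scales with the file size:

# Flip a biased coin for every line; p = 0.01 keeps roughly 1% of them.
sample_lines <- function(lines, p = 0.01) {
  lines[as.logical(rbinom(length(lines), size = 1, prob = p))]
}
sample_Blog_alt <- sample_lines(Blog)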

4. Create the corpus for exploratory data analysis.

library(tm)
library(SnowballC)
library(RWeka)
# Combine the three samples into one corpus and clean it up.
sampled_text <- c(sample_Blog, sample_News, sample_Twitter)
corpus <- Corpus(VectorSource(sampled_text))
corpus <- tm_map(corpus, content_transformer(tolower))   # lower-case everything
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeWords, stopwords("english"))
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, stripWhitespace)
corpus <- tm_map(corpus, stemDocument)                   # reduce words to their stems
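A quick spot check that the transformations behaved as expected (the exact text depends on the random sample drawn above):

# The first cleaned document should now be lower-case, punctuation-free,
# stop-word-free, and stemmed.
writeLines(as.character(corpus[[1]]))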

# Gram Tokenizers
gram1Tokenizer <- function(x) {NGramTokenizer(x, RWeka::Weka_control(min = 1, max = 1))}
gram2Tokenizer <- function(x) {NGramTokenizer(x, RWeka::Weka_control(min = 2, max = 2))}
gram3Tokenizer <- function(x) {NGramTokenizer(x, RWeka::Weka_control(min = 3, max = 3))}
gram4Tokenizer <- function(x) {NGramTokenizer(x, RWeka::Weka_control(min = 4, max = 4))}
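A quick sanity check of one tokenizer on a toy string; the bigram tokenizer should return every overlapping pair of adjacent words:

gram2Tokenizer("thanks for the follow")
## e.g. "thanks for"  "for the"  "the follow"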

tdm1 <- TermDocumentMatrix(corpus, control = list(tokenize = gram1Tokenizer))
tdm2 <- TermDocumentMatrix(corpus, control = list(tokenize = gram2Tokenizer))
tdm3 <- TermDocumentMatrix(corpus, control = list(tokenize = gram3Tokenizer))
tdm4 <- TermDocumentMatrix(corpus, control = list(tokenize = gram4Tokenizer))

# Keep only the n-grams that appear in at least 1% of the sampled documents.
tdm1_1 <- removeSparseTerms(tdm1, sparse = 0.99)
tdm2_2 <- removeSparseTerms(tdm2, sparse = 0.99)
tdm3_3 <- removeSparseTerms(tdm3, sparse = 0.99)
tdm4_4 <- removeSparseTerms(tdm4, sparse = 0.99)

5. Exploratory Data Analysis
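As a first pass, the term-document matrices built above can be ranked by total frequency to find the most common n-grams in the sample. The helper below is a minimal sketch (top_terms() is an illustrative name, not a tm function):

# Total frequency of each n-gram across all sampled documents,
# sorted in decreasing order.
top_terms <- function(tdm, n = 20) {
  freq <- sort(rowSums(as.matrix(tdm)), decreasing = TRUE)
  keep <- seq_len(min(n, length(freq)))
  data.frame(term = names(freq)[keep], frequency = freq[keep], row.names = NULL)
}

top_terms(tdm1_1)  # most frequent single words
top_terms(tdm2_2)  # most frequent bigrams
top_terms(tdm3_3)  # most frequent trigrams

These frequency tables (and bar plots built from them) form the exploratory analysis, and the same counts feed directly into the n-gram probability tables for the prediction algorithm.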

6. Feedback on the plans for creating a prediction algorithm and Shiny app

The next step is to create a product that highlights the prediction algorithm and provides an interface that others can access. The following two items will be submitted:

  1. A Shiny app that takes a phrase (multiple words) in a text box as input and outputs a prediction of the next word; a minimal interface sketch follows below.
  2. A slide deck of no more than 5 slides, created with RStudio Presenter (https://support.rstudio.com/hc/en-us/articles/200486468-Authoring-R-Presentations), pitching the algorithm and app as if presenting to a boss or an investor.
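A minimal sketch of the planned app, assuming a predict_next_word() placeholder for the eventual n-gram back-off model (the function name and its constant output are illustrative, not the final algorithm):

library(shiny)

# Placeholder: the real app will look the phrase up in the n-gram
# frequency tables built above, backing off to shorter n-grams when
# a longer match is not found.
predict_next_word <- function(phrase) "the"

ui <- fluidPage(
  titlePanel("Next-Word Prediction"),
  textInput("phrase", "Enter a phrase:"),
  textOutput("prediction")
)

server <- function(input, output) {
  output$prediction <- renderText({
    if (nzchar(input$phrase)) predict_next_word(input$phrase) else ""
  })
}

shinyApp(ui = ui, server = server)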