The goal here is to build a first simple model of the relationship between words, which is the first step toward a predictive text-mining application. We explore simple models here and will move on to more sophisticated modeling techniques later. The purpose of this milestone is simply to demonstrate that we are comfortable working with the data and are on track to create the prediction algorithm. The deliverable is a report published on RPubs (http://rpubs.com/) that explains the exploratory analysis and the goals for the eventual app and algorithm.
setwd("~/Downloads")
fileUrl<-"https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
download.file(fileUrl,destfile = "~/Downloads/Coursera-SwiftKey.zip",method="curl")
unzip("Coursera-SwiftKey.zip")
setwd("~/Downloads/final/en_US")
#Blog
en_blog <- file("en_US.blogs.txt", "r")
Blog <- readLines(en_blog)
close(en_blog)
#News
en_news <- file("en_US.news.txt", "r")
News <- readLines(en_news)
close(en_news)
#Twitter
en_twitter <- file("en_US.twitter.txt", "r")
Twitter <- readLines(en_twitter, skipNul = TRUE)  # skip the embedded nul characters in this file
close(en_twitter)
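One caveat worth noting (an assumption, not something observed in this run): on some platforms readLines() stops early on en_US.news.txt because of an embedded control character. If the line counts reported below look too low, re-reading the file through a binary-mode connection is a common workaround, sketched here:
# Optional: re-read the news file in binary mode if the text-mode read was truncated.
en_news_bin <- file("en_US.news.txt", open = "rb")
News <- readLines(en_news_bin, encoding = "UTF-8", skipNul = TRUE)
close(en_news_bin)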
library(stringi)
# Summarize the number of lines and characters in each file.
Summary<-data.frame(FilesName=c("Blog","News","Twitter"))
stri_stats_general(Blog)
## Lines LinesNEmpty Chars CharsNWhite
## 899288 899288 206824382 170389539
stri_stats_general(News)
## Lines LinesNEmpty Chars CharsNWhite
## 1010242 1010242 203223154 169860866
stri_stats_general(Twitter)
## Lines LinesNEmpty Chars CharsNWhite
## 2360148 2360148 162096241 134082806
# Count the words in each file.
Blogs.words <- stri_count_words(Blog)
News.words <- stri_count_words(News)
Twitter.words <- stri_count_words(Twitter)
Summary$Lines <- c(899288, 1010242, 2360148)          # line counts from the stri_stats_general() output above
Summary$Chars <- c(206824382, 203223154, 162096241)   # character counts from the stri_stats_general() output above
Summary$Words=(c(sum(Blogs.words),sum(News.words),sum(Twitter.words)))
Summary$Words_Average=(c(mean(Blogs.words),mean(News.words),mean(Twitter.words)))
Summary
## FilesName Lines Chars Words Words_Average
## 1 Blog 899288 206824382 37546246 41.75108
## 2 News 1010242 203223154 34762395 34.40997
## 3 Twitter 2360148 162096241 30093410 12.75065
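As a side note, the hard-coded Lines and Chars values above can also be pulled straight from the stri_stats_general() results, which keeps the table in sync with the data. A small sketch (Summary2 is just an illustrative name):
# Rebuild the same summary table directly from stri_stats_general(),
# instead of typing the counts in by hand.
stats <- rbind(stri_stats_general(Blog),
               stri_stats_general(News),
               stri_stats_general(Twitter))
Summary2 <- data.frame(FilesName     = c("Blog", "News", "Twitter"),
                       Lines         = stats[, "Lines"],
                       Chars         = stats[, "Chars"],
                       Words         = c(sum(Blogs.words), sum(News.words), sum(Twitter.words)),
                       Words_Average = c(mean(Blogs.words), mean(News.words), mean(Twitter.words)))
Summary2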
library(ggplot2)
library(gridExtra)
## Warning: package 'gridExtra' was built under R version 3.2.4
# One bar per file for total lines, total words, and average words per line.
g1 <- qplot(FilesName, Lines, data = Summary, stat = "summary", fun.y = "sum",
            xlab = "Type", ylab = "", main = "Lines", geom = "bar", fill = factor(FilesName))
g1 <- g1 + theme(legend.position = "none")
g2 <- qplot(FilesName, Words, data = Summary, stat = "summary", fun.y = "sum",
            xlab = "Type", ylab = "", main = "Words", geom = "bar", fill = factor(FilesName))
g2 <- g2 + theme(legend.position = "none")
g3 <- qplot(FilesName, Words_Average, data = Summary, stat = "summary", fun.y = "sum",
            xlab = "Type", ylab = "", main = "Words_Average", geom = "bar", fill = factor(FilesName))
g3 <- g3 + theme(legend.position = "none")
grid.arrange(g1,g2,g3,ncol=3)
To build the models we do not need to load and use all of the data; a relatively small random sample of lines is usually enough to give a good approximation of the results we would get from the full corpus. Here we draw 10,000 lines at random from each file.
set.seed(1)
samplesize=10000
sample_Blog<-sample(Blog,samplesize)
sample_News<-sample(News,samplesize)
sample_Twitter<-sample(Twitter,samplesize)
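As a quick sanity check on how small these samples are relative to the full files (using the line counts already stored in Summary):
# Percentage of each file's lines captured by the 10,000-line samples:
# roughly 1% of the blog and news files and under 0.5% of the tweets.
round(samplesize / Summary$Lines * 100, 2)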
library(tm)
## Loading required package: NLP
## Warning: package 'NLP' was built under R version 3.2.3
##
## Attaching package: 'NLP'
##
## The following object is masked from 'package:ggplot2':
##
## annotate
library(SnowballC)
library(RWeka)
## Warning: package 'RWeka' was built under R version 3.2.4
# Combine the three samples into one character vector and build a corpus from it.
datasets <- c(sample_Blog, sample_News, sample_Twitter)
datasets2 <- VectorSource(datasets)
Corpus <- Corpus(datasets2)
Corpus <- tm_map(Corpus, content_transformer(tolower))       # lower-case all text
Corpus <- tm_map(Corpus, PlainTextDocument)
Corpus <- tm_map(Corpus, removePunctuation)                  # strip punctuation
Corpus <- tm_map(Corpus, removeWords, stopwords("english"))  # drop common English stop words
Corpus <- tm_map(Corpus, removeNumbers)                      # drop digits
Corpus <- tm_map(Corpus, stripWhitespace)                    # collapse repeated whitespace
Corpus <- tm_map(Corpus, PlainTextDocument)
Corpus <- tm_map(Corpus, stemDocument)                       # reduce words to their stems
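A quick spot check that the cleaning steps behaved as expected (the exact text depends on the random sample, so this is only illustrative):
# Look at the start of the first cleaned document: it should be lower-case,
# stemmed, and free of punctuation, numbers, and stop words.
substr(as.character(Corpus[[1]]), 1, 200)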
# N-gram tokenizers: unigrams through 4-grams for the term-document matrices.
gram1Tokenizer <- function(x) {NGramTokenizer(x, RWeka::Weka_control(min = 1, max = 1))}
gram2Tokenizer <- function(x) {NGramTokenizer(x, RWeka::Weka_control(min = 2, max = 2))}
gram3Tokenizer <- function(x) {NGramTokenizer(x, RWeka::Weka_control(min = 3, max = 3))}
gram4Tokenizer <- function(x) {NGramTokenizer(x, RWeka::Weka_control(min = 4, max = 4))}
dtm1<- TermDocumentMatrix(Corpus, control = list(tokenize = gram1Tokenizer))
dtm2 <- TermDocumentMatrix(Corpus, control = list(tokenize = gram2Tokenizer))
dtm3<- TermDocumentMatrix(Corpus, control = list(tokenize = gram3Tokenizer))
dtm4<- TermDocumentMatrix(Corpus, control = list(tokenize = gram4Tokenizer))
# Drop terms that appear in fewer than 1% of the documents to keep the matrices manageable.
dtm1_1 <- removeSparseTerms(dtm1, sparse = 0.99)
dtm2_2 <- removeSparseTerms(dtm2, sparse = 0.99)
dtm3_3 <- removeSparseTerms(dtm3, sparse = 0.99)
dtm4_4 <- removeSparseTerms(dtm4, sparse = 0.99)
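With the term-document matrices built, the most frequent n-grams give a first look at the word relationships the prediction model will rely on. A minimal sketch (freq1 and freq2 are illustrative names):
# Total frequency of each unigram and bigram across the sampled corpus.
freq1 <- sort(rowSums(as.matrix(dtm1_1)), decreasing = TRUE)
freq2 <- sort(rowSums(as.matrix(dtm2_2)), decreasing = TRUE)
head(freq1, 10)  # ten most common single words
head(freq2, 10)  # ten most common word pairs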
The next step is to create a product that highlights the prediction algorithm and provides an interface others can use. Two items will be submitted:
1. A Shiny app that takes a phrase (multiple words) typed into a text box as input and outputs a prediction of the next word.
2. A slide deck of no more than 5 slides, created with RStudio Presenter (https://support.rstudio.com/hc/en-us/articles/200486468-Authoring-R-Presentations), pitching the algorithm and app as if presenting to a boss or an investor.
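As a preview of how the app's prediction step might work, here is a minimal sketch of a next-word lookup built on the bigram counts from the sample (freq2 and predict_next are illustrative names only; the eventual algorithm will likely use higher-order n-grams with backoff and smoothing):
# Given the last word typed (lower-cased and stemmed to match the cleaned
# corpus), find the bigrams that start with it and return the most frequent
# continuations. This is a toy lookup, not the final model.
predict_next <- function(word, bigram_freq = freq2, n = 3) {
  candidates <- bigram_freq[grepl(paste0("^", word, " "), names(bigram_freq))]
  if (length(candidates) == 0) return(character(0))
  top <- head(sort(candidates, decreasing = TRUE), n)
  sapply(strsplit(names(top), " "), `[`, 2)  # keep the second word of each top bigram
}
predict_next("love")  # e.g. the three words that most often follow "love" in the sample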