This is the Milestone Report for the Coursera Data Science Capstone project. The goal of the capstone is to build a predictive text model using a large corpus of text documents as training data. The capstone is held in collaboration with SwiftKey. In this milestone report the relationships between words, tokens, and phrases in the text are explored; this exploration will eventually help us build our language model.
if (!file.exists("Coursera-SwiftKey.zip")) {
    download.file("https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip",
                  destfile = "Coursera-SwiftKey.zip")
    unzip("Coursera-SwiftKey.zip")
}
library(quanteda)
## quanteda version 0.9.9.50
## Using 3 of 4 cores for parallel computing
library(tm)        ## also attaches NLP
library(ggplot2)
There are three files: twitter.txt, blogs.txt and news.txt. The data set contains four different languages, but we will concentrate on the English files only. The details of these files are shown below: the number of lines and the total number of words in each.
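The raw text was read into dataTwitter, dataBlogs and dataNews before the counts below were computed. A minimal sketch of how this could be done is shown here, assuming the usual en_US file paths inside the unzipped archive; stringi is also loaded because stri_count_words() is used for the word counts.
library(stringi)
## Assumed locations of the English files inside the unzipped archive
dataTwitter <- readLines("final/en_US/en_US.twitter.txt", encoding = "UTF-8", skipNul = TRUE)
dataBlogs   <- readLines("final/en_US/en_US.blogs.txt",   encoding = "UTF-8", skipNul = TRUE)
dataNews    <- readLines("final/en_US/en_US.news.txt",    encoding = "UTF-8", skipNul = TRUE)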
length(dataTwitter)
## [1] 2360148
sum(stri_count_words(dataTwitter))
## [1] 30218166
length(dataBlogs)
## [1] 899288
sum(stri_count_words(dataBlogs))
## [1] 38154238
length(dataNews)
## [1] 77259
sum(stri_count_words(dataNews))
## [1] 2693898
From these three files we will create a corpus. We first take a 5% sample of each file and then combine the samples.
library(tm)
set.seed(5000)
data.sample <- c(sample(dataTwitter, length(dataTwitter) * 0.05),
                 sample(dataBlogs, length(dataBlogs) * 0.05),
                 sample(dataNews, length(dataNews) * 0.05))
sampleCorpus <- VCorpus(VectorSource(data.sample))
Assumptions:
###While cleaning the data we will not remove numbers, because some of the phrases we search for will require them.
###Stemming is usually part of cleaning, but it is not applied here because we will need all forms of a word for our output.
The following steps are performed to clean the sample corpus; a quick check of the cleaned documents follows the code.
###tolower case
sampleCorpus <- tm_map(sampleCorpus, content_transformer(tolower))
###remove stopwords
sampleCorpus <- tm_map(sampleCorpus, removeWords, stopwords("english"))
###remove punctuation
sampleCorpus <- tm_map(sampleCorpus, removePunctuation)
###to plain text document
sampleCorpus <- tm_map(sampleCorpus, PlainTextDocument)
###remove URL
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
sampleCorpus <- tm_map(sampleCorpus, toSpace, "(f|ht)tp(s?)://(.*)[.][a-z]+")
sampleCorpus <- tm_map(sampleCorpus,toSpace, "[^[:alpha:][:space:]]")
sampleCorpus <- tm_map(sampleCorpus,toSpace, "[^[:alnum:]]")
sampleCorpus <- tm_map(sampleCorpus,toSpace, "[[:punct:]]")
###Eliminating whitespace
sampleCorpus <- tm_map(sampleCorpus,stripWhitespace)
###ASCII filtering
sampleCorpus <- tm_map(sampleCorpus, content_transformer(function(x) iconv(x,to="ASCII", sub = " ")))
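As a quick, illustrative check (not part of the original pipeline), a few cleaned documents can be inspected to confirm that lower-casing, stopword removal and punctuation stripping took effect:
## Inspect a few cleaned documents
lapply(sampleCorpus[1:3], as.character)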
Once cleaning is done we move on to exploratory analysis. For this we use the quanteda package to create the corpus.
Once the corpus is prepared we apply n-gram tokenization and build unigram, 2-gram and 3-gram document-feature matrices from it. For this milestone report we show the top 30 terms of each.
sampleData <- corpus(sampleCorpus)
dfm_sampleData <- dfm(sampleData)
library(RColorBrewer)   ## needed for brewer.pal()
plot(dfm_sampleData, max.words=30, colors = brewer.pal(8, "Dark2"), scale=c(4, .5), min.freq=6)
plotDataUni <- topfeatures(dfm_sampleData, n=30)
plotDataUni_df <- data.frame(plotDataUni)
plotDataUni_df["unigram"] <- rownames(plotDataUni_df)
plotDataUni_df_plot <- ggplot(plotDataUni_df, aes(x=reorder(unigram, -plotDataUni), y=plotDataUni,fill=plotDataUni))
plotDataUni_df_plot <- plotDataUni_df_plot + geom_bar(position = "identity", stat = "identity")+coord_flip()
plotDataUni_df_plot
dfm_sampleDataBi <- dfm(sampleData, ignoredFeatures = stopwords("english"),ngrams=2)
plot(dfm_sampleDataBi, max.words=30, colors = brewer.pal(8, "Dark2"), scale=c(4, .5),min.freq=6)
plotDataBi <- topfeatures(dfm_sampleDataBi, n=30)
plotDataBi_df <- data.frame(plotDataBi)
plotDataBi_df["bigram"] <- rownames(plotDataBi_df)
plotDataBi_df_plot <- ggplot(plotDataBi_df, aes(x=reorder(bigram, -plotDataBi), y=plotDataBi, fill=plotDataBi))
plotDataBi_df_plot <- plotDataBi_df_plot + geom_bar(position = "identity", stat = "identity")+coord_flip()
plotDataBi_df_plot
dfm_sampleDataTri <- dfm(sampleData, ignoredFeatures = stopwords("english"),ngrams=3)
plot(dfm_sampleDataTri, max.words=30, colors = brewer.pal(8, "Dark2"), scale=c(4, .5),min.freq=6)
plotDataTri <- topfeatures(dfm_sampleDataTri, n=30)
plotDataTri_df <- data.frame(plotDataTri)
plotDataTri_df["trigram"] <- rownames(plotDataTri_df)
plotDataTri_df_plot <- ggplot(plotDataTri_df, aes(x=reorder(trigram, -plotDataTri), y=plotDataTri,fill=plotDataTri))
plotDataTri_df_plot <- plotDataTri_df_plot + geom_bar(position = "identity", stat = "identity")+coord_flip()
plotDataTri_df_plot
Once the frequency matrices are stored in data tables, the Katz back-off model is applied and the discount factors and left-over probabilities are calculated.
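The exact implementation is not reproduced here, but as a minimal sketch under standard Katz back-off assumptions, the left-over probability for each group of firstTerms is the probability mass freed up by discounting, 1 - sum(discount * freq) / sum(freq); it can be computed per group with data.table (column names taken from the snapshots below).
library(data.table)
## Sketch: left-over probability mass per firstTerms group (assumed formula)
calcLeftOverProb <- function(DT) {
  DT[, .(leftoverprob = 1 - sum(discount * freq) / sum(freq)), by = firstTerms]
}
## e.g. DT_bigram_leftOverProb <- calcLeftOverProb(DT_bigram)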
A snapshot of the current data tables is shown below.
## For trigram
head(DT_Trigram)
## Words freq discount lastTerm PX PY firstTerms
## 1: don_t_know 172 1 know don t don_t
## 2: happy_mothers_day 153 1 day happy mothers happy_mothers
## 3: let_us_know 127 1 know let us let_us
## 4: i_m_sure 106 1 sure i m i_m
## 5: i_m_going 102 1 going i m i_m
## 6: don_t_want 101 1 want don t don_t
head(DT_Trigram_leftOverProb)
## firstTerms leftoverprob
## 1: don_t 0
## 2: happy_mothers 0
## 3: let_us 0
## 4: i_m 0
## 5: happy_new 0
## 6: looking_forward 0
## For bigram
head(DT_bigram)
## Words freq discount firstTerms lastTerm
## 1: it_s 1961 0 it s
## 2: i_m 1573 0 i m
## 3: don_t 1454 0 don t
## 4: right_now 1113 0 right now
## 5: didn_t 718 0 didn t
## 6: last_night 710 0 last night
head(DT_bigram_leftOverProb)
## firstTerms leftoverprob
## 1: it 0.9769648
## 2: i 0.9974150
## 3: don 0.9865839
## 4: right 0.6299592
## 5: didn 1.0000000
## 6: last 0.5837822
## For unigram
head(DT_unigram)
## Words freq discount
## 1: trader_joes 12666 1
## 2: night_hubby 12666 1
## 3: team_d 12666 1
## 4: props_tebow 12666 1
## 5: energy_boost 12666 1
## 6: city_dirt 12666 1