This milestone report for the Capstone Project demonstrates the progress made in learning data science. The objective of the project is to build a data product that applies data science to natural language processing: a Shiny app that accepts some text input and predicts what the next word will be.
The specified dataset for this project is available for download at https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip
This dataset contains German, English, Finnish and Russian files. I will be using the English dataset: en_US.news.txt, en_US.twitter.txt, en_US.blogs.txt
# Load the three English source files, one element per line of text.
# NOTE(review): the directory below is hard-coded and machine-specific; it is
# kept so that the working directory seen by later code does not change.
setwd("/Users/thyagarr/coursera/Capstone/final/en_US")

# Passing the file name (rather than file(...)) lets readLines() open the
# connection and close it again itself; wrapping the name in file() leaves an
# undestroyed connection object behind for every call.
# skipNul = TRUE drops embedded NUL bytes instead of truncating the line with a
# warning, and encoding = "UTF-8" marks the strings with the encoding the
# dataset is distributed in.
blogsData <- readLines("en_US.blogs.txt", encoding = "UTF-8", skipNul = TRUE)
newsData <- readLines("en_US.news.txt", encoding = "UTF-8", skipNul = TRUE)
twitterData <- readLines("en_US.twitter.txt", encoding = "UTF-8", skipNul = TRUE)

# Report the number of lines read from each file.
print(paste("Blog length = ", length(blogsData), ", News length = ", length(newsData), " ,Twitter length = ", length(twitterData) ))
## [1] "Blog length = 899288 , News length = 1010242 ,Twitter length = 2360148"
# Count whitespace-separated words across a character vector of lines.
# strsplit() returns one token vector per line, so the tokens have to be
# flattened and counted; length(strsplit(x, ...)) alone is just the number of
# lines. nzchar() discards the empty token that a line with leading whitespace
# produces.
count_words <- function(lines) {
  tokens <- strsplit(lines, "[[:space:]]+")
  sum(nzchar(unlist(tokens, use.names = FALSE)))
}

# Each figure is computed from its own source (the Twitter count previously
# reused blogsData).
# NOTE: the recorded output line that follows this call was produced by the
# earlier version, which reported line counts; regenerate it after this fix.
print(paste("Blog word count =", count_words(blogsData),
", News word count =", count_words(newsData),
", Twitter word count =", count_words(twitterData)
))
## [1] "Blog word count = 899288 , News word count = 1010242 , Twitter word count = 899288"
We can merge the three files together and use the tm library to build a text corpus. We will be using a 0.5% sample of randomly selected lines.
# Combine the three sources into a single character vector of lines.
# c() keeps every original line as its own element. paste() would instead glue
# the i-th blog, news and tweet lines together into one string and silently
# recycle the two shorter vectors, so the "lines" sampled below would be
# artificial three-source concatenations.
merged <- c(blogsData, newsData, twitterData)

# Pick a random sample: every line is kept independently with probability
# 0.005. The seed makes the selection reproducible.
line_cnt <- length(merged)
set.seed(7789)
merged_idx <- rbinom(line_cnt, 1, 0.005)
sample_data <- merged[merged_idx == 1]

# Drop duplicated lines so repeated text is not counted more than once.
sample_data <- unique(sample_data)
# NOTE: the sample size recorded further down was produced by the earlier
# paste()-based merge; regenerate it after this fix.
The number of samples we will be exploring is
# Number of elements in sample_data, i.e. the unique sampled lines.
length(sample_data)
## [1] 11838
# Build a tm corpus from the sampled lines, clean it, and leave `corpus` as a
# plain character vector (one cleaned string per sampled line) ready for
# tokenization.
library(tm)

set.seed(7789)
corpus <- VCorpus(VectorSource(sample_data))

# Base functions such as tolower() and gsub() must be wrapped in
# content_transformer(); applied bare to a VCorpus they replace each document
# object with a bare character vector (tm >= 0.6).
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeWords, stopwords("english"))

# Remove what is left of URLs. Punctuation has already been stripped, so a URL
# is now a single run of word characters starting with "http". Calling gsub()
# on the corpus object itself would coerce the whole list (metadata included)
# to deparsed strings, so the substitution is applied per document instead.
corpus <- tm_map(
  corpus,
  content_transformer(function(x) gsub("http\\w+", "", x))
)

# Extract the cleaned text of every document as one string per document.
corpus <- vapply(
  corpus,
  function(doc) paste(as.character(doc), collapse = " "),
  character(1),
  USE.NAMES = FALSE
)
The goal is to predict the next word based on a sequence of words as input. To achieve this, we have to evaluate n-grams and their frequencies in the training data.
# NOTE(review): hard-coded, machine-specific working directory; kept unchanged
# so that later code sees the same working directory as before.
setwd("/Users/thyagarr/coursera/Capstone/final/en_US")
library(RWeka)
# Sets the mc.cores option to 1 — presumably to keep parallel-aware code
# single-process while the Java-backed tokenizer runs; confirm.
options(mc.cores=1)

# Build a frequency table of n-grams of exactly `n` words.
#
# dataframe: the text to tokenize. Despite the name this is a character vector
#            of documents; a tm corpus is also accepted and is reduced to one
#            string per document first.
# n:         n-gram size (a single number >= 1); used as both min and max.
#
# Returns a data frame with columns `text` (the n-gram) and `frequency` (its
# count), sorted by decreasing frequency.
findNGrams <- function(dataframe, n) {
  stopifnot(is.numeric(n), length(n) == 1L, !is.na(n), n >= 1)

  # A corpus object handed straight to the tokenizer would be coerced as a
  # whole list, so pull out the text of each document explicitly.
  if (inherits(dataframe, "Corpus")) {
    dataframe <- unlist(
      lapply(dataframe, function(doc) paste(as.character(doc), collapse = " ")),
      use.names = FALSE
    )
  }

  tokens <- NGramTokenizer(
    dataframe,
    Weka_control(min = n, max = n, delimiters = " \\r\\n\\t.,;:\"()?!")
  )

  # No n-grams at all (empty input, or every document shorter than n words):
  # return an empty table with the documented columns.
  if (length(tokens) == 0L) {
    return(data.frame(text = factor(character(0)), frequency = integer(0)))
  }

  counts <- data.frame(table(tokens))
  counts <- counts[order(counts$Freq, decreasing = TRUE), ]
  colnames(counts) <- c("text", "frequency")
  counts
}
# Frequency tables for 1- to 4-word sequences of the cleaned text.
Unigram   <- findNGrams(corpus, n = 1)
BiGrams   <- findNGrams(corpus, n = 2)
TriGrams  <- findNGrams(corpus, n = 3)
QuadGrams <- findNGrams(corpus, n = 4)

# The tables are already sorted by decreasing frequency, so the first 20 rows
# of each are its 20 most frequent n-grams.
Top20.1gram <- head(Unigram,   n = 20L)
Top20.2gram <- head(BiGrams,   n = 20L)
Top20.3gram <- head(TriGrams,  n = 20L)
Top20.4gram <- head(QuadGrams, n = 20L)
# Draw a bar chart of one top-20 n-gram table.
#
# top:    data frame whose first column holds the n-grams and whose second
#         column holds their frequencies.
# n:      n-gram size, used only in the plot title.
# colour: bar colour.
#
# One helper replaces four copy-pasted barplot() calls, and the title typo
# ("freequent") is corrected in a single place.
plotTopNGrams <- function(top, n, colour) {
  invisible(barplot(
    top[, 2],
    cex.names = 0.5,
    names.arg = top[, 1],
    col = colour,
    main = paste0("Top 20 most frequent ", n, "-Gram"),
    las = 2  # las = 2 draws the axis labels perpendicular to the axis
  ))
}

plotTopNGrams(Top20.1gram, 1, "green")
plotTopNGrams(Top20.2gram, 2, "red")
plotTopNGrams(Top20.3gram, 3, "blue")
plotTopNGrams(Top20.4gram, 4, "black")
WordClouds
# Word clouds of the 20 most frequent 2-, 3- and 4-grams.
library(wordcloud)
## Loading required package: RColorBrewer
# library() stops with an error when the package is missing; require() would
# only return FALSE and let the script fail later at brewer.pal().
library(RColorBrewer)

# Eight-colour "Dark2" palette shared by all three clouds.
# NOTE(review): this variable shares its name with grDevices::palette().
palette <- brewer.pal(8, "Dark2")

# TRUE/FALSE are spelled out because T and F are ordinary variables that can be
# reassigned. text() adds a caption at the bottom of each cloud.
wordcloud(Top20.2gram[, 1], Top20.2gram[, 2], min.freq = 1,
          random.order = FALSE, ordered.colors = FALSE, colors = palette)
text(x = 0.5, y = 0, "2-gram cloud")

wordcloud(Top20.3gram[, 1], Top20.3gram[, 2], min.freq = 1,
          random.order = FALSE, ordered.colors = FALSE, colors = palette)
text(x = 0.5, y = 0, "3-gram cloud")

wordcloud(Top20.4gram[, 1], Top20.4gram[, 2], min.freq = 1,
          random.order = FALSE, ordered.colors = FALSE, colors = palette)
text(x = 0.5, y = 0, "4-gram cloud")