The goal of the Data Science Capstone Project is to build a predictive model (Natural Language Processing) for next-word prediction: given a word or phrase as input, the product/application shall try to predict the next word.
This milestone report presents an exploratory analysis of the training data, carried out to understand the distribution of words and the relationships between words, tokens, and phrases in the corpora.
Understand frequencies of words and word pairs: build figures and tables to show how the frequencies of words and word pairs vary in the data.
The Capstone dataset is available at: https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip
The dataset contains data obtained from blog posts, news feeds, and tweets from Twitter. The files are saved in plain-text (txt) format with \n newlines.
Set the working directory and remove all old objects
setwd("/home/alok/capstone_swk")
rm(list = ls())
Load the libraries (note: not all of these libraries are used here, but they shall be used in the final application)
suppressWarnings(suppressMessages(library(igraph)))
suppressWarnings(suppressMessages(library(biclust)))
suppressWarnings(suppressMessages(library(RColorBrewer)))
suppressWarnings(suppressMessages(library(tm)))
suppressWarnings(suppressMessages(library(SnowballC)))
suppressWarnings(suppressMessages(library(ggplot2)))
suppressWarnings(suppressMessages(library(wordcloud)))
suppressWarnings(suppressMessages(library(cluster)))
suppressWarnings(suppressMessages(library(RWeka)))
suppressWarnings(suppressMessages(library(caTools)))
suppressWarnings(suppressMessages(library(rpart)))
suppressWarnings(suppressMessages(library(rpart.plot)))
suppressWarnings(suppressMessages(library(randomForest)))
suppressWarnings(suppressMessages(library(qdap)))
Download the data
download.file("https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip",destfile="Coursera-SwiftKey.zip",method="curl")
unzip("Coursera-SwiftKey.zip")
blogsfile <- "final/en_US/en_US.blogs.txt"
newsfile <- "final/en_US/en_US.news.txt"
twitterfile <- "final/en_US/en_US.twitter.txt"
combine_files <- "final/en_US/en_US.all.txt"
combine_clean_files <- "final/en_US/en_US.all_3.txt"
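As a cross-check of the summary table below, the per-file statistics can also be computed directly in R. This is a rough sketch using the file paths defined above; readLines-based counts may differ slightly from the Unix wc/uniq figures in the table.
file_stats <- function(path) {
  # Read the file and split lines into whitespace-separated tokens
  lines <- readLines(path, encoding = "UTF-8", warn = FALSE, skipNul = TRUE)
  words <- unlist(strsplit(lines, "\\s+"))
  words <- words[words != ""]
  data.frame(file_name   = basename(path),
             file_size_MB = round(file.info(path)$size / 1024^2, 2),
             line_count   = length(lines),
             word_count   = length(words),
             char_count   = sum(nchar(lines)),
             unique_words = length(unique(tolower(words))))
}
do.call(rbind, lapply(c(blogsfile, newsfile, twitterfile), file_stats))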
Raw data summary (the all_file_cat* rows were produced by the Unix script below)
| file_name | unique_words (types) | file_size (MB) | line_count | word_count (tokens) | char_count |
|---|---|---|---|---|---|
| blogs | 1214516 | 200.4297 | 899288 | 37334114 | 210160014 |
| news | 945730 | 196.2812 | 1010242 | 34365936 | 205811889 |
| twitter | 1443911 | 159.3672 | 2360148 | 30359804 | 167105338 |
| Sum (blogs+news+twitter) | 2825934 | 556.0781 | 4269678 | 102059854 | 583077241 |
| all_file_cat | 2825934 | 556.0703 | 4269678 | 103041866 | 583077241 |
| all_file_cat_c | 541029 | 525.8711 | 4269678 | 103041866 | 551410913 |
**Unix Code**
system("cat final/en_US/en_US.blogs.txt final/en_US/en_US.news.txt final/en_US/en_US.twitter.txt > final/en_US/en_US.all.txt")
system ("bash /home/alok/capstone_swk/final/en_US/data_clean.bash")
cat data_clean.bash
#!/bin/bash
cd /home/alok/capstone_swk/final/en_US
#Convert upper case to lower case
sed 's/\([A-Z]\)/\L\1/g' en_US.all.txt > en_US.all_1.txt
#Replace all numbers and special characters (anything other than a-z) with a space
sed 's/[^a-z]/ /g;' en_US.all_1.txt > en_US.all_2.txt
#Collapse multiple spaces and trim leading/trailing whitespace
awk '{$1=$1};1' en_US.all_2.txt > en_US.all_3.txt
rm en_US.all_1.txt
rm en_US.all_2.txt
## Identify unique words and their frequency counts, sorted numerically on column 1;
## inspect 50 words with "tail -n 50 en_US.all_3_uniq.txt"
cat en_US.all_3.txt|tr " " "\n"|sort |uniq -c|sort -k 1n -r > en_US.all_3_uniq.txt
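Since the script sorts the counts in descending order, the top of en_US.all_3_uniq.txt holds the most frequent words. A quick sanity check can read them back into R; this is a sketch, with fill = TRUE guarding against the occasional line where the token is empty.
top_words <- read.table("final/en_US/en_US.all_3_uniq.txt",
                        col.names = c("Freq", "word"),
                        nrows = 50, fill = TRUE,
                        stringsAsFactors = FALSE)
head(top_words, 10)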
Read and sample the data
blogs <- readLines(blogsfile, encoding="UTF-8", warn=FALSE, skipNul=TRUE)
news <- readLines(newsfile, encoding="UTF-8", warn=FALSE, skipNul=TRUE)
twitter <- readLines(twitterfile, encoding="UTF-8", warn=FALSE, skipNul=TRUE)
**Random sampling of documents/features.** Approximately 1% of each file is sampled: 1% of the 899,288 blog lines is about 8,992, and since the number of unique words is high, slightly more than 1% (but less than 1.5%) is taken.
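Because sample() draws at random, the exact lines sampled will change from run to run; fixing a seed first makes the sample, and everything derived from it, reproducible. The seed value below is arbitrary.
# Optional: fix the RNG seed so the same sample is drawn on every run
set.seed(1234)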
Blogs_sample <- sample(blogs, 10000)
Approx 1% of 1010242 is 10102
News_sample <- sample(news, 11102)
Approx 1% of 2360148 is 23601
Twitter_sample <- sample(twitter, 23601)
Remove the blogs, news and twitter objects to free memory
rm(blogs,news,twitter)
Clean the sample data
sdata <- c(Blogs_sample, News_sample, Twitter_sample)
Remove hashtags
sdata <- gsub(" #\\S*","", sdata)
Remove URLs
sdata <- gsub("(f|ht)(tp)(s?)(://)(\\S*)", "", sdata)
Remove Twitter handles
sdata <- gsub(" @[^\\s]+","",sdata)
Remove special characters
sdata <- gsub("[^0-9A-Za-z///' ]", "", sdata)
Build the corpus from the sample data
corpus <- Corpus(VectorSource(sdata))
Clean the corpus
corpus <- tm_map(corpus, tolower)
corpus <- tm_map(corpus, PlainTextDocument)
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, stripWhitespace)
corpus <- tm_map(corpus, removeWords,stopwords("english"))
corpus <- tm_map(corpus, stemDocument)
corpus <- tm_map(corpus, stripWhitespace)
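A quick spot-check of the first cleaned document shows whether the transformations took effect.
# The first document should now be lower case, stemmed and free of
# punctuation, numbers and English stop words
as.character(corpus[[1]])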
Generate a data frame from the corpus and remove the corpus object
sample_df = data.frame(text=unlist(sapply(corpus, '[',"content")),stringsAsFactors=F)
token_delim =" \\t\\r\\n.!?,;\"()"
rm(corpus)
Tokenize into n-grams (1-gram to 4-gram)
UnigramTokenizer = NGramTokenizer(sample_df$text, Weka_control(min=1,max=1))
BigramTokenizer = NGramTokenizer(sample_df$text, Weka_control(min=2,max=2, delimiters = token_delim))
TrigramTokenizer = NGramTokenizer(sample_df$text, Weka_control(min=3,max=3, delimiters = token_delim))
QuadgramTokenizer = NGramTokenizer(sample_df$text, Weka_control(min=4,max=4, delimiters = token_delim))
Convert the tokenized n-grams to frequency data frames
unigramTable=data.frame(table(UnigramTokenizer))
bigramTable=data.frame(table(BigramTokenizer))
trigramTable=data.frame(table(TrigramTokenizer))
quadgramTable=data.frame(table(QuadgramTokenizer))
Sort the n-grams by frequency
unigramTable=unigramTable[order(unigramTable$Freq,decreasing = TRUE),]
bigramTable=bigramTable[order(bigramTable$Freq,decreasing = TRUE),]
trigramTable=trigramTable[order(trigramTable$Freq,decreasing = TRUE),]
quadgramTable=quadgramTable[order(quadgramTable$Freq,decreasing = TRUE),]
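These sorted n-gram tables are the building blocks of the eventual predictor: to suggest a next word, match the leading words of an n-gram against the user's input and return the most frequent completions. Below is a minimal sketch using the bigram table; the helper name is mine and is not part of the final application.
# Hypothetical helper: given one input word, return the most frequent words
# that followed it in the sampled corpus, according to bigramTable
predict_next <- function(word, n = 3) {
  # bigramTable is already sorted by Freq, so the first matches are the most frequent
  grams <- as.character(bigramTable$BigramTokenizer)
  hits  <- grams[grepl(paste0("^", word, " "), grams)]
  head(sub(paste0("^", word, " "), "", hits), n)
}
predict_next("thank")
The same lookup extends to the trigram and quadgram tables by matching the last two or three words of the input.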
Unigram bar plot and word cloud
ggplot(unigramTable[1:25,], aes(x=reorder(UnigramTokenizer,-Freq,sum), y=Freq)) + geom_bar(stat="identity", fill="red") + geom_text(aes(label=Freq), vjust=-0.4) + labs(x="Words") + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + ggtitle("1-Gram: Top 25 Most Frequent Words")
wordcloud(unigramTable$UnigramTokenizer, unigramTable$Freq, scale=c(3,0.1),colors=brewer.pal(6, "Dark2"),rot.per=0.35, max.words=40)
Bigram bar plot and word cloud
ggplot(bigramTable[1:25,], aes(x=reorder(BigramTokenizer,-Freq,sum), y=Freq)) + geom_bar(stat="identity", fill="green") + geom_text(aes(label=Freq), vjust=-0.4) + labs(x="Words") + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + ggtitle("2-Gram: Top 25 Most Frequent Word Pairs")
wordcloud(bigramTable$BigramTokenizer, bigramTable$Freq, scale=c(3,0.1), colors=brewer.pal(6, "Dark2"),rot.per=0.35, max.words=40)
Trigram bar plot
ggplot(trigramTable[1:25,], aes(x=reorder(TrigramTokenizer,-Freq,sum), y=Freq)) + geom_bar(stat="identity", fill="red") + geom_text(aes(label=Freq), vjust=-0.4) + labs(x="Words") + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + ggtitle("3-Gram: Top 25 Most Frequent Trigrams")
Quadgram bar plot
ggplot(quadgramTable[1:25,], aes(x=reorder(QuadgramTokenizer,-Freq,sum), y=Freq)) + geom_bar(stat="identity", fill="green") + geom_text(aes(label=Freq), vjust=-0.4) + labs(x="Words") + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + ggtitle("4-Gram: Top 25 Most Frequent Quadgrams")
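Looking ahead to the final application, the n-gram frequency tables can be saved to disk once so the prediction app only loads pre-computed tables instead of re-tokenizing the corpus; the file names below are placeholders.
# Persist the frequency tables for the prediction app (file names are placeholders)
saveRDS(unigramTable, "unigram_freq.rds")
saveRDS(bigramTable, "bigram_freq.rds")
saveRDS(trigramTable, "trigram_freq.rds")
saveRDS(quadgramTable, "quadgram_freq.rds")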