The goal of this project is to demonstrate that you have become comfortable working with the data and that you are on track to create your prediction algorithm. Please submit a report on RPubs (http://rpubs.com/) that explains your exploratory analysis and your goals for the eventual app and algorithm. This document should be concise, explaining only the major features of the data you have identified and briefly summarizing your plans for creating the prediction algorithm and Shiny app in a way that would be understandable to a non-data-scientist manager. You should make use of tables and plots to illustrate important summaries of the data set.
The motivation for this project is to demonstrate that the data have been downloaded and successfully loaded, to produce a basic report of summary statistics about the data sets, to report any interesting findings identified so far, and to get feedback on the plans for creating the prediction algorithm and Shiny app.
This is the Milestone Report submission for the Data Science Capstone project. The goal of this project is to create a predictive text model like the one SwiftKey, the corporate partner in this capstone, builds into its smart keyboards. Given a large collection of text documents (a corpus), Natural Language Processing tools and techniques are used to perform statistical analysis, occurrence counts, and association mining on monolingual or multilingual texts.
The data sets made available for this project are extracted from the Internet and come from three distinct sources: blogs, news articles, and Twitter feeds. They are also provided in four different languages: English (US), German, Finnish, and Russian. In the following data analysis, we will focus only on the American English data sets. Also, since the provided files are quite large, random subsets of the files will be extracted and retained for the analysis.
setwd("D:/$R!K@^T#/NEW_LEARNINGS/Data_Science/Data_Science_Met/C_10/Coursera-Swiftkey")
EN_US_Blogs <- readLines("final/en_US/en_US.blogs.txt", warn = FALSE, encoding = "UTF-8")
EN_US_News <- readLines("final/en_US/en_US.news.txt", warn = FALSE, encoding = "UTF-8")
EN_US_Twitter <- readLines("final/en_US/en_US.twitter.txt", warn = FALSE, encoding = "UTF-8")
summary <- data.frame('File' = c("EN_US_Blogs","EN_US_News","EN_US_Twitter"),
"File_Size" = sapply(list(EN_US_Blogs, EN_US_News, EN_US_Twitter), function(x){format(object.size(x),"MB")}),
'Row_Count' = sapply(list(EN_US_Blogs, EN_US_News, EN_US_Twitter), function(x){length(x)}),
'Total_Characters' = sapply(list(EN_US_Blogs, EN_US_News, EN_US_Twitter), function(x){sum(nchar(x))}),
'Max_Characters' = sapply(list(EN_US_Blogs, EN_US_News, EN_US_Twitter), function(x){max(unlist(lapply(x, function(y) nchar(y))))})
)
summary
## File File_Size Row_Count Total_Characters Max_Characters
## 1 EN_US_Blogs 255.4 Mb 899288 206824505 40833
## 2 EN_US_News 19.8 Mb 77259 15639408 5760
## 3 EN_US_Twitter 319 Mb 2360148 162096031 140
Only the English-language data sets are considered. Since these data sets are too large to process in full, a 3% random sample of each is used for this proof of concept. Twitter, news, and blogs are explored separately.
r=file("final/en_US/en_US.blogs.txt")
text_blogs <- readLines(r,skipNul = T)
close(r)
library(quanteda)
library(tm)
set.seed(123)
blogs_sample <- sample(text_blogs, 0.03*length(text_blogs))
text_blogs_corp<-corpus(blogs_sample)
test<-texts(text_blogs_corp) %>%
char_tolower() %>%
tokens() %>%
tokens_remove(stopwords("english")) %>%
tokens_wordstem() %>%tokens_remove(" ")
# Unigram document-feature matrix, dropping punctuation, numbers, symbols, URLs and Twitter characters
Txt_blogs_dfm <- dfm(test, remove_punct = TRUE, remove_numbers = TRUE, remove_symbols = TRUE, remove_url = TRUE, remove_twitter = TRUE)
textplot_wordcloud(Txt_blogs_dfm, rotation = 0.25,
                   color = rev(RColorBrewer::brewer.pal(10, "RdBu")))
# Bigram document-feature matrix
Txt_blogs_dfm_2 <- dfm(test, remove_punct = TRUE, remove_numbers = TRUE, remove_symbols = TRUE, remove_url = TRUE, remove_twitter = TRUE, ngrams = 2)
textplot_wordcloud(Txt_blogs_dfm_2, rotation = 0.25, adjust = FALSE,
                   color = rev(RColorBrewer::brewer.pal(10, "RdBu")))
# Unigram frequencies for the blogs sample
blogs_freq <- data.frame(word = featnames(Txt_blogs_dfm), freq = colSums(Txt_blogs_dfm), row.names = NULL, stringsAsFactors = TRUE)
# Cumulative proportion covered by the most frequent words (words within the first 5% of total occurrences)
test_cum <- blogs_freq[order(-blogs_freq$freq), ]
test_cum$propo <- cumsum(test_cum$freq) / sum(test_cum$freq)
test_cum <- test_cum[test_cum$propo <= 0.05, ]
test_cum$word <- factor(test_cum$word, levels = unique(test_cum$word)[order(test_cum$freq, decreasing = TRUE)])
# Top 25 unigrams by frequency
blogs_freq <- blogs_freq[order(-blogs_freq$freq), ][1:25, ]
blogs_freq$word <- factor(blogs_freq$word, levels = unique(blogs_freq$word)[order(blogs_freq$freq, decreasing = TRUE)])
library(ggplot2)
ggplot(blogs_freq, aes(word, freq)) + geom_bar(stat = 'identity', colour = 'blue', fill = 'blue') + ylab('Word Frequency') + theme_bw() + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + ggtitle('1-gram word frequency of Blogs')
ggplot(test_cum, aes(word, propo)) + geom_bar(stat = 'identity', colour = 'blue', fill = 'blue') + ylab('Cumulative Proportion') + theme_bw() + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + ggtitle('Cumulative proportion of 1-gram words')
# Bigram frequencies for the blogs sample
blogs_freq_2 <- data.frame(word = featnames(Txt_blogs_dfm_2), freq = colSums(Txt_blogs_dfm_2), row.names = NULL, stringsAsFactors = TRUE)
# Cumulative proportion covered by the most frequent bigrams
test_cum <- blogs_freq_2[order(-blogs_freq_2$freq), ]
test_cum$propo <- cumsum(test_cum$freq) / sum(test_cum$freq)
test_cum <- test_cum[test_cum$propo <= 0.003, ]
test_cum$word <- factor(test_cum$word, levels = unique(test_cum$word)[order(test_cum$freq, decreasing = TRUE)])
# Top 25 bigrams by frequency
blogs_freq_2 <- blogs_freq_2[order(-blogs_freq_2$freq), ][1:25, ]
blogs_freq_2$word <- factor(blogs_freq_2$word, levels = unique(blogs_freq_2$word)[order(blogs_freq_2$freq, decreasing = TRUE)])
library(plotly)
ggplot(blogs_freq_2, aes(word, freq)) + geom_bar(stat = 'identity', colour = 'blue', fill = 'blue') + ylab('Word Frequency') + theme_bw() + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + ggtitle('2-gram word frequency of Blogs')
ggplot(head(test_cum, 50), aes(word, propo)) + geom_bar(stat = 'identity', colour = 'blue', fill = 'blue') + ylab('Cumulative Proportion') + theme_bw() + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + ggtitle('Cumulative proportion of 2-gram words')
r=file("final/en_US/en_US.news.txt")
text_news <- readLines(r,skipNul = T)
close(r)
library(quanteda)
set.seed(123)
news_sample <- sample(text_news, 0.03*length(text_blogs))
text_news_corp<-corpus(news_sample)
test<-texts(text_news_corp) %>%
char_tolower() %>%
tokens() %>%
tokens_remove(stopwords("english")) %>%
tokens_wordstem()
# Unigram and bigram document-feature matrices for the news sample
Txt_news_dfm <- dfm(test, remove_punct = TRUE, remove_numbers = TRUE, remove_symbols = TRUE, remove_url = TRUE, remove_twitter = TRUE)
textplot_wordcloud(Txt_news_dfm, rotation = 0.25,
                   color = rev(RColorBrewer::brewer.pal(10, "RdBu")))
Txt_news_dfm_2 <- dfm(test, remove_punct = TRUE, remove_numbers = TRUE, remove_symbols = TRUE, remove_url = TRUE, remove_twitter = TRUE, ngrams = 2)
textplot_wordcloud(Txt_news_dfm_2, rotation = 0.25, adjust = FALSE,
                   color = rev(RColorBrewer::brewer.pal(10, "RdBu")))
# Unigram frequencies for the news sample
news_freq <- data.frame(word = featnames(Txt_news_dfm), freq = colSums(Txt_news_dfm), row.names = NULL, stringsAsFactors = TRUE)
# Cumulative proportion covered by the most frequent words
test_cum <- news_freq[order(-news_freq$freq), ]
test_cum$propo <- cumsum(test_cum$freq) / sum(test_cum$freq)
test_cum <- test_cum[test_cum$propo <= 0.005, ]
test_cum$word <- factor(test_cum$word, levels = unique(test_cum$word)[order(test_cum$freq, decreasing = TRUE)])
# Top 25 unigrams by frequency
news_freq <- news_freq[order(-news_freq$freq), ][1:25, ]
news_freq$word <- factor(news_freq$word, levels = unique(news_freq$word)[order(news_freq$freq, decreasing = TRUE)])
library(plotly)
ggplot(news_freq, aes(word, freq)) + geom_bar(stat = 'identity', colour = 'blue', fill = 'blue') + ylab('Word Frequency') + theme_bw() + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + ggtitle('1-gram word frequency of News')
ggplot(test_cum, aes(word, propo)) + geom_bar(stat = 'identity', colour = 'blue', fill = 'blue') + ylab('Cumulative Proportion') + theme_bw() + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + ggtitle('Cumulative proportion of 1-gram words')
# Bigram frequencies for the news sample
news_freq_2 <- data.frame(word = featnames(Txt_news_dfm_2), freq = colSums(Txt_news_dfm_2), row.names = NULL, stringsAsFactors = TRUE)
# Cumulative proportion covered by the most frequent bigrams
test_cum <- news_freq_2[order(-news_freq_2$freq), ]
test_cum$propo <- cumsum(test_cum$freq) / sum(test_cum$freq)
test_cum <- test_cum[test_cum$propo <= 0.003, ]
test_cum$word <- factor(test_cum$word, levels = unique(test_cum$word)[order(test_cum$freq, decreasing = TRUE)])
# Top 25 bigrams by frequency
news_freq_2 <- news_freq_2[order(-news_freq_2$freq), ][1:25, ]
news_freq_2$word <- factor(news_freq_2$word, levels = unique(news_freq_2$word)[order(news_freq_2$freq, decreasing = TRUE)])
library(plotly)
ggplot(news_freq_2, aes(word, freq)) + geom_bar(stat = 'identity', colour = 'blue', fill = 'blue') + ylab('Word Frequency') + theme_bw() + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + ggtitle('2-gram word frequency of News')
ggplot(head(test_cum, 50), aes(word, propo)) + geom_bar(stat = 'identity', colour = 'blue', fill = 'blue') + ylab('Cumulative Proportion') + theme_bw() + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + ggtitle('Cumulative proportion of 2-gram words')
r=file("final/en_US/en_US.twitter.txt")
text_twitter <- readLines(r,skipNul = T)
close(r)
library(quanteda)
set.seed(123)
twitter_sample <- sample(text_twitter, 0.03*length(text_blogs))
text_twitter_corp<-corpus(twitter_sample)
test<-texts(text_twitter_corp) %>%
char_tolower() %>%
tokens() %>%
tokens_remove(stopwords("english")) %>%
tokens_wordstem()
# Unigram and bigram document-feature matrices for the Twitter sample
Txt_twitter_dfm <- dfm(test, remove_punct = TRUE, remove_numbers = TRUE, remove_symbols = TRUE, remove_url = TRUE, remove_twitter = TRUE)
textplot_wordcloud(Txt_twitter_dfm, rotation = 0.25,
                   color = rev(RColorBrewer::brewer.pal(10, "RdBu")))
Txt_twitter_dfm_2 <- dfm(test, remove_punct = TRUE, remove_numbers = TRUE, remove_symbols = TRUE, remove_url = TRUE, remove_twitter = TRUE, ngrams = 2)
textplot_wordcloud(Txt_twitter_dfm_2, rotation = 0.25, adjust = FALSE,
                   color = rev(RColorBrewer::brewer.pal(10, "RdBu")))
# Unigram frequencies for the Twitter sample
twitter_freq <- data.frame(word = featnames(Txt_twitter_dfm), freq = colSums(Txt_twitter_dfm), row.names = NULL, stringsAsFactors = TRUE)
# Cumulative proportion covered by the most frequent words
test_cum <- twitter_freq[order(-twitter_freq$freq), ]
test_cum$propo <- cumsum(test_cum$freq) / sum(test_cum$freq)
test_cum <- test_cum[test_cum$propo <= 0.05, ]
test_cum$word <- factor(test_cum$word, levels = unique(test_cum$word)[order(test_cum$freq, decreasing = TRUE)])
# Top 25 unigrams by frequency
twitter_freq <- twitter_freq[order(-twitter_freq$freq), ][1:25, ]
twitter_freq$word <- factor(twitter_freq$word, levels = unique(twitter_freq$word)[order(twitter_freq$freq, decreasing = TRUE)])
library(plotly)
ggplot(twitter_freq, aes(word, freq)) + geom_bar(stat = 'identity', colour = 'blue', fill = 'blue') + ylab('Word Frequency') + theme_bw() + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + ggtitle('1-gram word frequency of Twitter')
ggplot(test_cum, aes(word, propo)) + geom_bar(stat = 'identity', colour = 'blue', fill = 'blue') + ylab('Cumulative Proportion') + theme_bw() + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + ggtitle('Cumulative proportion of 1-gram words')
# Bigram frequencies for the Twitter sample
twitter_freq_2 <- data.frame(word = featnames(Txt_twitter_dfm_2), freq = colSums(Txt_twitter_dfm_2), row.names = NULL, stringsAsFactors = TRUE)
# Cumulative proportion covered by the most frequent bigrams
test_cum <- twitter_freq_2[order(-twitter_freq_2$freq), ]
test_cum$propo <- cumsum(test_cum$freq) / sum(test_cum$freq)
test_cum <- test_cum[test_cum$propo <= 0.003, ]
test_cum$word <- factor(test_cum$word, levels = unique(test_cum$word)[order(test_cum$freq, decreasing = TRUE)])
# Top 25 bigrams by frequency
twitter_freq_2 <- twitter_freq_2[order(-twitter_freq_2$freq), ][1:25, ]
twitter_freq_2$word <- factor(twitter_freq_2$word, levels = unique(twitter_freq_2$word)[order(twitter_freq_2$freq, decreasing = TRUE)])
library(plotly)
ggplot(twitter_freq_2, aes(word, freq)) + geom_bar(stat = 'identity', colour = 'blue', fill = 'blue') + ylab('Word Frequency') + theme_bw() + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + ggtitle('2-gram word frequency of Twitter')
ggplot(head(test_cum, 50), aes(word, propo)) + geom_bar(stat = 'identity', colour = 'blue', fill = 'blue') + ylab('Cumulative Proportion') + theme_bw() + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + ggtitle('Cumulative proportion of 2-gram words')
As the 1-gram and 2-gram word clouds show, the blogs, news, and Twitter data have different vocabularies: the frequency patterns of the three sources are clearly distinct, and foreign words appear in all three data sets. A quick way to quantify this difference is sketched below.
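To make the vocabulary difference more concrete, the small check below compares the most frequent stems of the three samples using the dfm objects built above. This is only a rough sketch: the cut-off of 500 top features and the Jaccard overlap measure are choices made here for illustration, not part of the analysis above.

library(quanteda)
# Top-n stems of each source (names of the n most frequent features in each dfm)
top_feats <- function(x, n = 500) names(topfeatures(x, n))
tops <- list(blogs = top_feats(Txt_blogs_dfm),
             news = top_feats(Txt_news_dfm),
             twitter = top_feats(Txt_twitter_dfm))
# Pairwise Jaccard overlap of the top-feature sets; lower values mean more distinct vocabularies
jaccard <- function(a, b) length(intersect(a, b)) / length(union(a, b))
round(c(blogs_news = jaccard(tops$blogs, tops$news),
        blogs_twitter = jaccard(tops$blogs, tops$twitter),
        news_twitter = jaccard(tops$news, tops$twitter)), 2)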
Since the sources have different vocabularies, I will build and validate the model on one type of data source at a time, and I will try to predict the next word using an N-gram model as well as an N-gram model with POS tagging. I will also try to take into account the user's past vocabulary usage, foreign words, and rare words. A minimal sketch of the N-gram idea is shown below.
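As a first illustration of the planned N-gram approach, the sketch below uses a bigram frequency table, built the same way as the blogs bigram dfm above, to look up the most likely next word. It is only an outline under assumptions: the predict_next() helper is hypothetical, the table is built from stemmed, stopword-free tokens, and smoothing, backoff to lower-order N-grams, and POS tagging are not shown.

# Assumed input: the full bigram dfm for the blogs sample built earlier (Txt_blogs_dfm_2)
bigram_freq <- data.frame(word = featnames(Txt_blogs_dfm_2),
                          freq = colSums(Txt_blogs_dfm_2),
                          stringsAsFactors = FALSE)
# quanteda joins the two words of a bigram with "_"; split them back out
parts <- strsplit(bigram_freq$word, "_", fixed = TRUE)
bigram_freq$w1 <- sapply(parts, `[`, 1)
bigram_freq$w2 <- sapply(parts, `[`, 2)
# Hypothetical helper: the n stems that most often follow a given (lower-cased, stemmed) word
predict_next <- function(word, n = 3) {
  cand <- bigram_freq[bigram_freq$w1 == word, ]
  cand <- cand[order(-cand$freq), ]
  head(cand$w2, n)
}
predict_next("love")  # e.g. the three stems most often following "love" in the blogs sample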