The goal of this project is to demonstrate that you have become comfortable working with the data and that you are on track to create your prediction algorithm. Please submit a report on RPubs (http://rpubs.com/) that explains your exploratory analysis and your goals for the eventual app and algorithm. This document should be concise, explaining only the major features of the data you have identified and briefly summarizing your plans for creating the prediction algorithm and Shiny app in a way that would be understandable to a non-data-scientist manager. You should make use of tables and plots to illustrate important summaries of the data set.
The motivation for this project is to demonstrate that the data have been downloaded and successfully loaded, to produce a basic report of summary statistics about the data sets, to report any interesting findings identified so far, and to get feedback on the plans for creating the prediction algorithm and Shiny app.
This is the Milestone Report submission for the Data Science Capstone project. The goal of this project is to create a predictive text model like the one SwiftKey, the corporate partner in this capstone, builds into its smart keyboards. Given a large collection of text documents (a corpus), Natural Language Processing tools and techniques are used to perform statistical analysis, occurrence counts, and association mining on monolingual or multilingual texts.
The data sets made available for this project are extracted from the Internet and come from three distinct sources: blogs, news articles, and Twitter feeds. They are also provided in four different languages: English (US), German, Finnish, and Russian. In the following data analysis, we will focus only on the American English data sets. Also, since the provided files are quite large, random subsets of the files will be extracted and retained for the analysis.
setwd("D:/$R!K@^T#/NEW_LEARNINGS/Data_Science/Data_Science_Met/C_10/Coursera-Swiftkey")
EN_US_Blogs <- readLines("final/en_US/en_US.blogs.txt", warn = FALSE, encoding = "UTF-8")
EN_US_News <- readLines("final/en_US/en_US.news.txt", warn = FALSE, encoding = "UTF-8")
EN_US_Twitter <- readLines("final/en_US/en_US.twitter.txt", warn = FALSE, encoding = "UTF-8")
summary <- data.frame('File' = c("EN_US_Blogs","EN_US_News","EN_US_Twitter"),
"File_Size" = sapply(list(EN_US_Blogs, EN_US_News, EN_US_Twitter), function(x){format(object.size(x),"MB")}),
'Row_Count' = sapply(list(EN_US_Blogs, EN_US_News, EN_US_Twitter), function(x){length(x)}),
'Total_Characters' = sapply(list(EN_US_Blogs, EN_US_News, EN_US_Twitter), function(x){sum(nchar(x))}),
'Max_Characters' = sapply(list(EN_US_Blogs, EN_US_News, EN_US_Twitter), function(x){max(unlist(lapply(x, function(y) nchar(y))))})
)
summary
## File File_Size Row_Count Total_Characters Max_Characters
## 1 EN_US_Blogs 255.4 Mb 899288 206824505 40833
## 2 EN_US_News 19.8 Mb 77259 15639408 5760
## 3 EN_US_Twitter 319 Mb 2360148 162096031 140
Only the English-language data sets are considered. Since these data sets are too large to process in full, a 3% random sample of each is used for this proof of concept. Twitter, news, and blogs are explored separately.
r=file("final/en_US/en_US.blogs.txt")
text_blogs <- readLines(r,skipNul = T)
close(r)
library(quanteda)
library(tm)
set.seed(123)
blogs_sample <- sample(text_blogs, 0.03*length(text_blogs))
text_blogs_corp<-corpus(blogs_sample)
test<-texts(text_blogs_corp) %>%
char_tolower() %>%
tokens() %>%
tokens_remove(stopwords("english")) %>%
tokens_wordstem() %>%tokens_remove(" ")
# Unigram document-feature matrix, dropping punctuation, numbers, symbols, URLs and Twitter characters
Txt_blogs_dfm <- dfm(test, remove_punct = TRUE, remove_numbers = TRUE, remove_symbols = TRUE, remove_url = TRUE, remove_twitter = TRUE)
textplot_wordcloud(Txt_blogs_dfm, rotation = 0.25,
                   color = rev(RColorBrewer::brewer.pal(10, "RdBu")))
# Bigram document-feature matrix
Txt_blogs_dfm_2 <- dfm(test, remove_punct = TRUE, remove_numbers = TRUE, remove_symbols = TRUE, remove_url = TRUE, remove_twitter = TRUE, ngrams = 2)
textplot_wordcloud(Txt_blogs_dfm_2, rotation = 0.25, adjust = FALSE,
                   color = rev(RColorBrewer::brewer.pal(10, "RdBu")))
# Unigram frequencies for the blogs sample
blogs_freq <- data.frame(word = featnames(Txt_blogs_dfm), freq = colSums(Txt_blogs_dfm), row.names = NULL, stringsAsFactors = TRUE)
# Cumulative proportion covered by the most frequent words (words within the first 5% of total occurrences)
test_cum <- blogs_freq[order(-blogs_freq$freq), ]
test_cum$propo <- cumsum(test_cum$freq) / sum(test_cum$freq)
test_cum <- test_cum[test_cum$propo <= 0.05, ]
test_cum$word <- factor(test_cum$word, levels = unique(test_cum$word)[order(test_cum$freq, decreasing = TRUE)])
# Top 25 unigrams by frequency
blogs_freq <- blogs_freq[order(-blogs_freq$freq), ][1:25, ]
blogs_freq$word <- factor(blogs_freq$word, levels = unique(blogs_freq$word)[order(blogs_freq$freq, decreasing = TRUE)])
library(ggplot2)
ggplot(blogs_freq, aes(word, freq)) + geom_bar(stat = 'identity', colour = 'blue', fill = 'blue') + ylab('Word Frequency') + theme_bw() + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + ggtitle('1-gram word frequency of Blogs')
ggplot(test_cum, aes(word, propo)) + geom_bar(stat = 'identity', colour = 'blue', fill = 'blue') + ylab('Cumulative Proportion') + theme_bw() + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + ggtitle('Cumulative proportion of 1-gram words')
# Bigram frequencies for the blogs sample
blogs_freq_2 <- data.frame(word = featnames(Txt_blogs_dfm_2), freq = colSums(Txt_blogs_dfm_2), row.names = NULL, stringsAsFactors = TRUE)
# Cumulative proportion covered by the most frequent bigrams
test_cum <- blogs_freq_2[order(-blogs_freq_2$freq), ]
test_cum$propo <- cumsum(test_cum$freq) / sum(test_cum$freq)
test_cum <- test_cum[test_cum$propo <= 0.003, ]
test_cum$word <- factor(test_cum$word, levels = unique(test_cum$word)[order(test_cum$freq, decreasing = TRUE)])
# Top 25 bigrams by frequency
blogs_freq_2 <- blogs_freq_2[order(-blogs_freq_2$freq), ][1:25, ]
blogs_freq_2$word <- factor(blogs_freq_2$word, levels = unique(blogs_freq_2$word)[order(blogs_freq_2$freq, decreasing = TRUE)])
library(plotly)
ggplot(blogs_freq_2, aes(word, freq)) + geom_bar(stat = 'identity', colour = 'blue', fill = 'blue') + ylab('Word Frequency') + theme_bw() + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + ggtitle('2-gram word frequency of Blogs')
ggplot(head(test_cum, 50), aes(word, propo)) + geom_bar(stat = 'identity', colour = 'blue', fill = 'blue') + ylab('Cumulative Proportion') + theme_bw() + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + ggtitle('Cumulative proportion of 2-gram words')
r=file("final/en_US/en_US.news.txt")
text_news <- readLines(r,skipNul = T)
close(r)
library(quanteda)
set.seed(123)
news_sample <- sample(text_news, 0.03*length(text_blogs))
text_news_corp<-corpus(news_sample)
test<-texts(text_news_corp) %>%
char_tolower() %>%
tokens() %>%
tokens_remove(stopwords("english")) %>%
tokens_wordstem()
# Unigram and bigram document-feature matrices for the news sample
Txt_news_dfm <- dfm(test, remove_punct = TRUE, remove_numbers = TRUE, remove_symbols = TRUE, remove_url = TRUE, remove_twitter = TRUE)
textplot_wordcloud(Txt_news_dfm, rotation = 0.25,
                   color = rev(RColorBrewer::brewer.pal(10, "RdBu")))
Txt_news_dfm_2 <- dfm(test, remove_punct = TRUE, remove_numbers = TRUE, remove_symbols = TRUE, remove_url = TRUE, remove_twitter = TRUE, ngrams = 2)
textplot_wordcloud(Txt_news_dfm_2, rotation = 0.25, adjust = FALSE,
                   color = rev(RColorBrewer::brewer.pal(10, "RdBu")))
# Unigram frequencies for the news sample
news_freq <- data.frame(word = featnames(Txt_news_dfm), freq = colSums(Txt_news_dfm), row.names = NULL, stringsAsFactors = TRUE)
# Cumulative proportion covered by the most frequent words
test_cum <- news_freq[order(-news_freq$freq), ]
test_cum$propo <- cumsum(test_cum$freq) / sum(test_cum$freq)
test_cum <- test_cum[test_cum$propo <= 0.005, ]
test_cum$word <- factor(test_cum$word, levels = unique(test_cum$word)[order(test_cum$freq, decreasing = TRUE)])
# Top 25 unigrams by frequency
news_freq <- news_freq[order(-news_freq$freq), ][1:25, ]
news_freq$word <- factor(news_freq$word, levels = unique(news_freq$word)[order(news_freq$freq, decreasing = TRUE)])
library(plotly)
ggplot(news_freq, aes(word, freq)) + geom_bar(stat = 'identity', colour = 'blue', fill = 'blue') + ylab('Word Frequency') + theme_bw() + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + ggtitle('1-gram word frequency of News')
ggplot(test_cum, aes(word, propo)) + geom_bar(stat = 'identity', colour = 'blue', fill = 'blue') + ylab('Cumulative Proportion') + theme_bw() + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + ggtitle('Cumulative proportion of 1-gram words')
# Bigram frequencies for the news sample
news_freq_2 <- data.frame(word = featnames(Txt_news_dfm_2), freq = colSums(Txt_news_dfm_2), row.names = NULL, stringsAsFactors = TRUE)
# Cumulative proportion covered by the most frequent bigrams
test_cum <- news_freq_2[order(-news_freq_2$freq), ]
test_cum$propo <- cumsum(test_cum$freq) / sum(test_cum$freq)
test_cum <- test_cum[test_cum$propo <= 0.003, ]
test_cum$word <- factor(test_cum$word, levels = unique(test_cum$word)[order(test_cum$freq, decreasing = TRUE)])
# Top 25 bigrams by frequency
news_freq_2 <- news_freq_2[order(-news_freq_2$freq), ][1:25, ]
news_freq_2$word <- factor(news_freq_2$word, levels = unique(news_freq_2$word)[order(news_freq_2$freq, decreasing = TRUE)])
library(plotly)
ggplot(news_freq_2, aes(word, freq)) + geom_bar(stat = 'identity', colour = 'blue', fill = 'blue') + ylab('Word Frequency') + theme_bw() + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + ggtitle('2-gram word frequency of News')
ggplot(head(test_cum, 50), aes(word, propo)) + geom_bar(stat = 'identity', colour = 'blue', fill = 'blue') + ylab('Cumulative Proportion') + theme_bw() + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + ggtitle('Cumulative proportion of 2-gram words')
r=file("final/en_US/en_US.twitter.txt")
text_twitter <- readLines(r,skipNul = T)
close(r)
library(quanteda)
set.seed(123)
twitter_sample <- sample(text_twitter, 0.03*length(text_blogs))
text_twitter_corp<-corpus(twitter_sample)
test<-texts(text_twitter_corp) %>%
char_tolower() %>%
tokens() %>%
tokens_remove(stopwords("english")) %>%
tokens_wordstem()
# Unigram and bigram document-feature matrices for the Twitter sample
Txt_twitter_dfm <- dfm(test, remove_punct = TRUE, remove_numbers = TRUE, remove_symbols = TRUE, remove_url = TRUE, remove_twitter = TRUE)
textplot_wordcloud(Txt_twitter_dfm, rotation = 0.25,
                   color = rev(RColorBrewer::brewer.pal(10, "RdBu")))
Txt_twitter_dfm_2 <- dfm(test, remove_punct = TRUE, remove_numbers = TRUE, remove_symbols = TRUE, remove_url = TRUE, remove_twitter = TRUE, ngrams = 2)
textplot_wordcloud(Txt_twitter_dfm_2, rotation = 0.25, adjust = FALSE,
                   color = rev(RColorBrewer::brewer.pal(10, "RdBu")))
# Unigram frequencies for the Twitter sample
twitter_freq <- data.frame(word = featnames(Txt_twitter_dfm), freq = colSums(Txt_twitter_dfm), row.names = NULL, stringsAsFactors = TRUE)
# Cumulative proportion covered by the most frequent words
test_cum <- twitter_freq[order(-twitter_freq$freq), ]
test_cum$propo <- cumsum(test_cum$freq) / sum(test_cum$freq)
test_cum <- test_cum[test_cum$propo <= 0.05, ]
test_cum$word <- factor(test_cum$word, levels = unique(test_cum$word)[order(test_cum$freq, decreasing = TRUE)])
# Top 25 unigrams by frequency
twitter_freq <- twitter_freq[order(-twitter_freq$freq), ][1:25, ]
twitter_freq$word <- factor(twitter_freq$word, levels = unique(twitter_freq$word)[order(twitter_freq$freq, decreasing = TRUE)])
library(plotly)
ggplot(twitter_freq, aes(word, freq)) + geom_bar(stat = 'identity', colour = 'blue', fill = 'blue') + ylab('Word Frequency') + theme_bw() + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + ggtitle('1-gram word frequency of Twitter')
ggplot(test_cum, aes(word, propo)) + geom_bar(stat = 'identity', colour = 'blue', fill = 'blue') + ylab('Cumulative Proportion') + theme_bw() + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + ggtitle('Cumulative proportion of 1-gram words')
# Bigram frequencies for the Twitter sample
twitter_freq_2 <- data.frame(word = featnames(Txt_twitter_dfm_2), freq = colSums(Txt_twitter_dfm_2), row.names = NULL, stringsAsFactors = TRUE)
# Cumulative proportion covered by the most frequent bigrams
test_cum <- twitter_freq_2[order(-twitter_freq_2$freq), ]
test_cum$propo <- cumsum(test_cum$freq) / sum(test_cum$freq)
test_cum <- test_cum[test_cum$propo <= 0.003, ]
test_cum$word <- factor(test_cum$word, levels = unique(test_cum$word)[order(test_cum$freq, decreasing = TRUE)])
# Top 25 bigrams by frequency
twitter_freq_2 <- twitter_freq_2[order(-twitter_freq_2$freq), ][1:25, ]
twitter_freq_2$word <- factor(twitter_freq_2$word, levels = unique(twitter_freq_2$word)[order(twitter_freq_2$freq, decreasing = TRUE)])
library(plotly)
ggplot(twitter_freq_2, aes(word, freq)) + geom_bar(stat = 'identity', colour = 'blue', fill = 'blue') + ylab('Word Frequency') + theme_bw() + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + ggtitle('2-gram word frequency of Twitter')
ggplot(head(test_cum, 50), aes(word, propo)) + geom_bar(stat = 'identity', colour = 'blue', fill = 'blue') + ylab('Cumulative Proportion') + theme_bw() + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + ggtitle('Cumulative proportion of 2-gram words')
As the 1-gram and 2-gram word clouds show, the blogs, news, and Twitter data have different vocabularies: the frequency patterns of the three sources are clearly distinct, and foreign words appear in all three data sets. A quick way to quantify this difference is sketched below.
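To make the vocabulary difference more concrete, the small check below compares the most frequent stems of the three samples using the dfm objects built above. This is only a rough sketch: the cut-off of 500 top features and the Jaccard overlap measure are choices made here for illustration, not part of the analysis above.

library(quanteda)
# Top-n stems of each source (names of the n most frequent features in each dfm)
top_feats <- function(x, n = 500) names(topfeatures(x, n))
tops <- list(blogs = top_feats(Txt_blogs_dfm),
             news = top_feats(Txt_news_dfm),
             twitter = top_feats(Txt_twitter_dfm))
# Pairwise Jaccard overlap of the top-feature sets; lower values mean more distinct vocabularies
jaccard <- function(a, b) length(intersect(a, b)) / length(union(a, b))
round(c(blogs_news = jaccard(tops$blogs, tops$news),
        blogs_twitter = jaccard(tops$blogs, tops$twitter),
        news_twitter = jaccard(tops$news, tops$twitter)), 2)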
Since the sources have different vocabularies, I will build and validate the model on one type of data source at a time, and I will try to predict the next word using an N-gram model as well as an N-gram model with POS tagging. I will also try to take into account the user's past vocabulary usage, foreign words, and rare words. A minimal sketch of the N-gram idea is shown below.
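As a first illustration of the planned N-gram approach, the sketch below uses a bigram frequency table, built the same way as the blogs bigram dfm above, to look up the most likely next word. It is only an outline under assumptions: the predict_next() helper is hypothetical, the table is built from stemmed, stopword-free tokens, and smoothing, backoff to lower-order N-grams, and POS tagging are not shown.

# Assumed input: the full bigram dfm for the blogs sample built earlier (Txt_blogs_dfm_2)
bigram_freq <- data.frame(word = featnames(Txt_blogs_dfm_2),
                          freq = colSums(Txt_blogs_dfm_2),
                          stringsAsFactors = FALSE)
# quanteda joins the two words of a bigram with "_"; split them back out
parts <- strsplit(bigram_freq$word, "_", fixed = TRUE)
bigram_freq$w1 <- sapply(parts, `[`, 1)
bigram_freq$w2 <- sapply(parts, `[`, 2)
# Hypothetical helper: the n stems that most often follow a given (lower-cased, stemmed) word
predict_next <- function(word, n = 3) {
  cand <- bigram_freq[bigram_freq$w1 == word, ]
  cand <- cand[order(-cand$freq), ]
  head(cand$w2, n)
}
predict_next("love")  # e.g. the three stems most often following "love" in the blogs sample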