This report presents some basic exploratory analysis of the SwiftKey data set. The main objective is to apply data science in the area of natural language processing. The end goal of this analysis is to build a Shiny application that accepts text typed by the user and tries to predict the next word. This is the first part of the data analysis for the SwiftKey application.
The data for this application is available in 4 different languages. For each language there are 3 text files containing text from blogs, news/media and Twitter. In this analysis we focus on the English data set.
setwd("~/Capstone/en_US/")
newsData <- readLines(file("en_US.news.txt"))
## Warning in readLines(file("en_US.news.txt")): incomplete final line found
## on 'en_US.news.txt'
blogData <- readLines(file("en_US.blogs.txt"))
twitterData <- readLines(file("en_US.twitter.txt"))
## Warning: line 167155 appears to contain an embedded nul
## Warning: line 268547 appears to contain an embedded nul
## Warning: line 1274086 appears to contain an embedded nul
## Warning: line 1759032 appears to contain an embedded nul
print(paste( " News data length = " ,length(newsData),
", Blog data length= " ,length(blogData),
", Twitter data length=", length(twitterData)
))
## [1] " News data length = 77259 , Blog data length= 899288 , Twitter data length= 2360148"
Sample data from the three text files is merged together to form a single corpus.
marged_data<- paste(newsData[1:4000],blogData[1:4000],twitterData[1:4000])
swearwords <- read.table('../profanity/Badword.txt', sep='\n')
names(swearwords) <- "swearwords"
# One masked replacement string per profane word
filter <- rep('***', nrow(swearwords))
profanity <- data.frame(swearwords, target = filter)
library(tm)
## Warning: package 'tm' was built under R version 3.2.3
## Loading required package: NLP
## Warning: package 'NLP' was built under R version 3.2.3
tokenizer <- function(Corpus, profanity = data.frame()){
  Corpus <- Corpus(VectorSource(Corpus))
  # Force everything to valid UTF-8 before any other transformation
  Corpus <- tm_map(Corpus, function(x) iconv(enc2utf8(x$content), sub = "byte"))
  Corpus <- tm_map(Corpus, removeWords, stopwords("english"))
  Corpus <- tm_map(Corpus, removePunctuation)
  Corpus <- tm_map(Corpus, tolower)
  Corpus <- tm_map(Corpus, removeNumbers)
  Corpus <- tm_map(Corpus, stripWhitespace)
  Corpus <- tm_map(Corpus, PlainTextDocument)
  # Replace every profane word with its masked form ("***")
  toString <- content_transformer(function(x, from, to) gsub(from, to, x))
  for(i in seq_len(nrow(profanity))){
    Corpus <- tm_map(Corpus, toString, profanity[i, 1], profanity[i, 2])
  }
  return(Corpus)
}
tokenized_data <- tokenizer(marged_data,profanity)
## Warning: closing unused connection 5 (en_US.twitter.txt)
library(RWeka)
## Warning: package 'RWeka' was built under R version 3.2.3
# Converting Corpus into data frame
merge_df <- data.frame(text=unlist(sapply(tokenized_data, `[` , "content")),stringsAsFactors=F )
findNGrams <- function(data, grams){
  # Build all n-grams of a single size and return them sorted by frequency
  ngram <- NGramTokenizer(data, Weka_control(min = grams, max = grams))
  ngram <- data.frame(table(ngram))
  names(ngram) <- c("term", "count")
  ngram <- ngram[order(ngram$count, decreasing = TRUE), ]
  return(ngram)
}
oneGram <- findNGrams(merge_df,1)
biGram <- findNGrams(merge_df,2)
triGram <- findNGrams(merge_df,3)
quartGram <- findNGrams(merge_df,4)
This shows that we can cover 95% of the text using only 13094 words.
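As a rough check of this figure (not part of the original computation), the coverage can be derived directly from the one-gram table, which findNGrams returns sorted by decreasing count:
# Cumulative fraction of all word occurrences covered by the top terms
coverage <- cumsum(oneGram$count) / sum(oneGram$count)
# Smallest vocabulary size that reaches 95% coverage of the sample
print(which(coverage >= 0.95)[1])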
library(wordcloud)
## Loading required package: RColorBrewer
library(RColorBrewer)
par(mfrow = c(2, 2))
palette <- brewer.pal(8,"Dark2")
wordcloud(words = oneGram$term[1:50], freq = oneGram$count[1:50],min.freq = 1,random.order = F,random.color = F,colors =palette )
wordcloud(words = biGram$term[1:50], freq = biGram$count[1:50],random.order = F,random.color = F,colors =palette ,min.freq = 1)
wordcloud(words = triGram$term[1:50], freq = triGram$count[1:50],random.order = F,random.color = F,colors =palette )
wordcloud(words = quartGram$term[1:50], freq = quartGram$count[1:50],random.order = F,random.color = F,colors =palette ,min.freq = 1)
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.2.3
##
## Attaching package: 'ggplot2'
##
## The following object is masked from 'package:NLP':
##
## annotate
library(gridExtra)
## Warning: package 'gridExtra' was built under R version 3.2.3
Gone<-ggplot()+
geom_bar(data = oneGram[1:15,],aes(reorder(term,-count),count),stat="identity",color="blue")+
ggtitle("Top 15 One Gram ")+
theme(axis.text.x = element_text(angle = 45, hjust = 1))
Gtwo<-ggplot(biGram[1:15,],aes(reorder(term,-count),count))+
geom_bar(color="red",stat="identity")+
ggtitle("Top 15 Bi Gram ")+
theme(axis.text.x = element_text(angle = 45, hjust = 1))
GThree<-ggplot(triGram[1:15,],aes(reorder(term,-count),count))+
geom_bar(color="green",stat="identity")+ggtitle("Top 15 Tri Gram ")+
theme(axis.text.x = element_text(angle = 45, hjust = 1))
GFour<-ggplot(quartGram[1:15,],aes(reorder(term,-count),count))+
geom_bar(color="black",stat="identity")+ggtitle("Top 15 Quat Gram ")+
theme(axis.text.x = element_text(angle = 45, hjust = 1))
grid.arrange(Gone, Gtwo,GThree,GFour, ncol = 2,nrow=2)
Most of the N-grams occur only once. Since these singletons are not useful for prediction (they mostly lead to overfitting), we can use this fact to compress the data. Likewise, since a small vocabulary of around 13094 words already covers the vast majority of the text, the n-gram tables can be reduced to that vocabulary without losing much coverage. The singleton pruning is sketched below.
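As an illustration of the pruning idea (the pruned table names below are hypothetical, not objects used elsewhere in this report):
# Drop n-grams that occur only once; they carry little predictive value
biGramPruned    <- biGram[biGram$count > 1, ]
triGramPruned   <- triGram[triGram$count > 1, ]
quartGramPruned <- quartGram[quartGram$count > 1, ]
print(paste("Bi-grams kept:", nrow(biGramPruned), "of", nrow(biGram)))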
The next step is to use these N-gram frequencies to predict the next word and to build a simple Shiny app around that prediction, as sketched below.
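The prediction itself is not implemented in this report; the following is a minimal sketch of the intended back-off lookup, assuming the n-gram tables built above (predictNextWord is a hypothetical helper name, and the naive regex prefix match is only illustrative):
# A hypothetical next-word lookup: match the end of the (lowercased) input
# against the n-gram tables and back off from long to short contexts.
predictNextWord <- function(input) {
  words <- unlist(strsplit(tolower(input), "\\s+"))
  n <- length(words)
  if (n == 0) return(as.character(oneGram$term[1]))  # most frequent word overall
  for (k in min(3, n):1) {
    context <- paste(tail(words, k), collapse = " ")
    tbl <- list(biGram, triGram, quartGram)[[k]]     # table with k context words
    hits <- tbl[grepl(paste0("^", context, " "), tbl$term), ]
    if (nrow(hits) > 0) {
      best <- as.character(hits$term[which.max(hits$count)])
      return(tail(unlist(strsplit(best, " ")), 1))   # last word of the best n-gram
    }
  }
  as.character(oneGram$term[1])
}
predictNextWord("thanks for the")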