Summary

This report presents some basic exploratory analysis of the data set provided by SwiftKey. The main objective is to apply data science in the area of natural language processing. The end goal of this analysis is a Shiny application that accepts text input from the user and tries to predict the next possible word. This is the first part of the data analysis for the SwiftKey application.

DATA

The data for this application is available in 4 different languages. For each language there are 3 text files containing text from blogs, news/media and Twitter. In our analysis we will focus on the English data set.

Reading the data

setwd("~/Capstone/en_US/")
newsData <- readLines(file("en_US.news.txt"))
## Warning in readLines(file("en_US.news.txt")): incomplete final line found
## on 'en_US.news.txt'
blogData <- readLines(file("en_US.blogs.txt"))
twitterData <- readLines(file("en_US.twitter.txt"))
## Warning: line 167155 appears to contain an embedded nul
## Warning: line 268547 appears to contain an embedded nul
## Warning: line 1274086 appears to contain an embedded nul
## Warning: line 1759032 appears to contain an embedded nul
print(paste( " News data length = " ,length(newsData),
              ", Blog data length= " ,length(blogData),
              ", Twitter data length=",  length(twitterData)
            ))
## [1] " News data length =  77259 , Blog data length=  899288 , Twitter data length= 2360148"

Building the merged data

Sample data from the three text files is merged together to form a single corpus.

merged_data <- paste(newsData[1:4000], blogData[1:4000], twitterData[1:4000])
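The chunk above simply takes the first 4,000 lines of each source. As an illustrative alternative (not part of the original analysis), a random sample of lines could be drawn instead; sampleLines and merged_sample are hypothetical names.

# Sketch: sample 4,000 random lines from each source instead of the first 4,000.
set.seed(1234)
sampleLines <- function(x, n = 4000) x[sample(length(x), n)]
merged_sample <- paste(sampleLines(newsData), sampleLines(blogData), sampleLines(twitterData))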

Removing unwanted words from the data set.

swearwords <- read.table('../profanity/Badword.txt', sep = '\n')
names(swearwords) <- "swearwords"
# One '***' replacement per swear word (nrow, not length, gives the number of words)
filter <- rep('***', nrow(swearwords))
profanity <- data.frame(swearwords, target = filter)

Converting the data into a corpus.

library(tm)
## Warning: package 'tm' was built under R version 3.2.3
## Loading required package: NLP
## Warning: package 'NLP' was built under R version 3.2.3
tokenizer <- function(corpus, profanity = data.frame()){
  corpus <- Corpus(VectorSource(corpus))
  corpus <- tm_map(corpus, function(x) iconv(enc2utf8(x$content), sub = "byte"))
  corpus <- tm_map(corpus, removeWords, stopwords("english"))
  corpus <- tm_map(corpus, removePunctuation)
  corpus <- tm_map(corpus, tolower)
  corpus <- tm_map(corpus, removeNumbers)
  corpus <- tm_map(corpus, stripWhitespace)
  corpus <- tm_map(corpus, PlainTextDocument)
  
  toString <- content_transformer(function(x, from, to) gsub(from, to, x))
  
  # Mask each swear word with its replacement ('***')
  for(i in seq_len(nrow(profanity))){
    corpus <- tm_map(corpus, toString, profanity[i, 1], profanity[i, 2])
  }
  
  return(corpus)
}

tokenized_data <- tokenizer(merged_data, profanity)
## Warning: closing unused connection 5 (en_US.twitter.txt)

Finding n-grams in the data.

library(RWeka)
## Warning: package 'RWeka' was built under R version 3.2.3
# Converting the corpus into a data frame
merge_df <- data.frame(text = unlist(sapply(tokenized_data, `[`, "content")), stringsAsFactors = FALSE)

findNGrams <- function(data, grams){
  # Tokenize into n-grams of the requested size and count how often each occurs
  ngram <- NGramTokenizer(data, Weka_control(min = grams, max = grams))
  ngram <- data.frame(table(ngram))
  names(ngram) <- c("term", "count")
  # Sort by frequency and return the table explicitly
  ngram <- ngram[order(ngram$count, decreasing = TRUE), ]
  return(ngram)
}
oneGram <- findNGrams(merge_df,1)
biGram <- findNGrams(merge_df,2)
triGram <- findNGrams(merge_df,3)
quartGram <- findNGrams(merge_df,4)
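To make the tokenizer output concrete, here is a small illustrative call (not from the original report) that produces the bi-grams of a short sentence.

# Illustrative only: bi-grams of a short example string.
NGramTokenizer("the quick brown fox", Weka_control(min = 2, max = 2))
# expected: "the quick" "quick brown" "brown fox"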

Finding the coverage of words.

This shows that 95% of the text can be covered using only 13,094 words.
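The original report does not show the computation behind this figure. Below is a minimal sketch of how such a coverage number could be derived from the one-gram counts (already sorted by frequency above); coverage and wordsNeeded are hypothetical names, and 0.95 is the threshold quoted in the text.

# Cumulative share of all word occurrences covered by the most frequent words.
coverage <- cumsum(oneGram$count) / sum(oneGram$count)
wordsNeeded <- which(coverage >= 0.95)[1]
wordsNeeded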

Exploratory data analysis

Plotting word clouds

library(wordcloud)
## Loading required package: RColorBrewer
library(RColorBrewer)
par(mfrow = c(2, 2))
palette <- brewer.pal(8,"Dark2")

wordcloud(words = oneGram$term[1:50], freq = oneGram$count[1:50], min.freq = 1,
          random.order = F, random.color = F, colors = palette)

wordcloud(words = biGram$term[1:50], freq = biGram$count[1:50], min.freq = 1,
          random.order = F, random.color = F, colors = palette)

wordcloud(words = triGram$term[1:50], freq = triGram$count[1:50],
          random.order = F, random.color = F, colors = palette)

wordcloud(words = quartGram$term[1:50], freq = quartGram$count[1:50], min.freq = 1,
          random.order = F, random.color = F, colors = palette)

Histograms of the n-grams

library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.2.3
## 
## Attaching package: 'ggplot2'
## 
## The following object is masked from 'package:NLP':
## 
##     annotate
library(gridExtra)
## Warning: package 'gridExtra' was built under R version 3.2.3
Gone<-ggplot()+
  geom_bar(data = oneGram[1:15,],aes(reorder(term,-count),count),stat="identity",color="blue")+
  ggtitle("Top 15 One Gram ")+ 
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Gtwo<-ggplot(biGram[1:15,],aes(reorder(term,-count),count))+
  geom_bar(color="red",stat="identity")+
  ggtitle("Top 15 Bi Gram ")+ 
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

GThree<-ggplot(triGram[1:15,],aes(reorder(term,-count),count))+
  geom_bar(color="green",stat="identity")+ggtitle("Top 15 Tri Gram ")+ 
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

GFour<-ggplot(quartGram[1:15,],aes(reorder(term,-count),count))+
  geom_bar(color="black",stat="identity")+ggtitle("Top 15 Quat Gram ")+
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
       
grid.arrange(Gone, Gtwo, GThree, GFour, ncol = 2, nrow = 2)

Summary

Most of the n-grams occur only once. Since these are not useful for prediction (they tend to cause overfitting), we can use this fact to compress the data.

Since only 13,094 words are enough to cover 90% of the text, we can also use this fact to reduce the size of the n-gram tables.
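As an illustrative sketch of this reduction (not part of the original report), the singleton n-grams could simply be dropped; the *Pruned names are hypothetical.

# Keep only n-grams that occur more than once.
biGramPruned <- biGram[biGram$count > 1, ]
triGramPruned <- triGram[triGram$count > 1, ]
quartGramPruned <- quartGram[quartGram$count > 1, ]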

Next steps for the app

Use the n-gram tables to predict the next word and build a simple Shiny app.
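As a minimal sketch of how such a prediction could work (an assumption, not the app's final method), the last one or two words of the input can be looked up in the tri-gram and bi-gram tables with a simple back-off; predictNextWord is a hypothetical name.

# Illustrative back-off lookup: try the tri-gram table first, then the bi-gram table.
# Both tables are already sorted by count, so the first match is the most frequent.
predictNextWord <- function(input, biGram, triGram) {
  words <- unlist(strsplit(tolower(input), "\\s+"))
  words <- words[words != ""]
  n <- length(words)
  if (n >= 2) {
    prefix <- paste(words[n - 1], words[n])
    hits <- triGram[grepl(paste0("^", prefix, " "), triGram$term), ]
    if (nrow(hits) > 0) return(sub(".* ", "", as.character(hits$term[1])))
  }
  hits <- biGram[grepl(paste0("^", words[n], " "), biGram$term), ]
  if (nrow(hits) > 0) return(sub(".* ", "", as.character(hits$term[1])))
  NA
}

# Example use: predictNextWord("happy new", biGram, triGram)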