Aim: To perform exploratory analysis on the data sets provided for predictive text modeling.

Around the world, people are spending an increasing amount of time on their mobile devices for email, social networking, banking and a whole range of other activities. But typing on mobile devices can be a serious pain. SwiftKey, our corporate partner in this capstone, builds a smart keyboard that makes it easier for people to type on their mobile devices. One cornerstone of their smart keyboard is predictive text models. When someone types “I went to the”, the keyboard presents three options for what the next word might be. For example, the three words might be gym, store, restaurant. The three data sets provided by SwiftKey: blogs, news and twitter were downloaded and unzipped.

Data Processing

Packages installed:

library(knitr)
library(NLP)
library(tm)
library(stylo)
library(wordcloud)
library(ngram)
library(ggplot2)
## 
## Attaching package: 'ggplot2'
## 
## The following object is masked from 'package:NLP':
## 
##     annotate
library(stringr)
library(R.utils)

Loading data:

blogs <- readLines("/Users/arushigulati/Desktop/Capstone/final/en_US/en_US.blogs.txt")
news <- readLines("/Users/arushigulati/Desktop/Capstone/final/en_US/en_US.news.txt")
twitter <- readLines("/Users/arushigulati/Desktop/Capstone/final/en_US/en_US.twitter.txt")
## Warning: line 167155 appears to contain an embedded nul
## Warning: line 268547 appears to contain an embedded nul
## Warning: line 1274086 appears to contain an embedded nul
## Warning: line 1759032 appears to contain an embedded nul
Basic Analysis on Files

Number of lines in each file:

lenB <- length(blogs)
wordsB <- scan("/Users/arushigulati/Desktop/Capstone/final/en_US/en_US.blogs.txt", character(0))
lenN <- length(news)
wordsN <- scan("/Users/arushigulati/Desktop/Capstone/final/en_US/en_US.news.txt", character(0))
lenT <- length(twitter)
wordsT <- scan("/Users/arushigulati/Desktop/Capstone/final/en_US/en_US.twitter.txt", character(0))
## Warning: EOF within quoted string
## Warning: embedded nul(s) found in input

Result:

  • Blogs.txt:
    • Number of lines 899288
    • Number of words 35314175
  • News.txt:
    • Number of lines 1010242
    • Number of words- 29313276
  • Twitter.txt:
    • Number of lines 2360148
    • Number of words 9141409
Sub-sampling data
subBlogs <- sample(blogs, 5000)
subNews <- sample(news, 5000)
subTwitter <- sample(twitter, 5000)
Cleaning Data

Profanity filtering for all 3 data sets:

badwords <- as.list(read.table("/Users/arushigulati/Desktop/Capstone/final/en_US/badwords.txt", stringsAsFactors=FALSE))
myCorpusBlogs <- Corpus(VectorSource(subBlogs))
myCorpusBlogs = tm_map(myCorpusBlogs, removeWords, badwords$V1[1:500])
myCorpusNews <- Corpus(VectorSource(subNews))
myCorpusNews = tm_map(myCorpusNews, removeWords, badwords$V1[1:500])
myCorpusTwitter <- Corpus(VectorSource(subTwitter))
myCorpusTwitter = tm_map(myCorpusTwitter, removeWords, badwords$V1[1:500])

Removing punctuation, extra white spaces and converting all text to lowercase:

myCorpusBlogs = tm_map(myCorpusBlogs, removePunctuation)
myCorpusBlogs = tm_map(myCorpusBlogs, tolower)
myCorpusBlogs = tm_map(myCorpusBlogs, stripWhitespace)
myCorpusNews = tm_map(myCorpusNews, removePunctuation)
myCorpusNews = tm_map(myCorpusNews, stripWhitespace)
myCorpusNews = tm_map(myCorpusNews, tolower)
myCorpusTwitter = tm_map(myCorpusTwitter, removePunctuation)
myCorpusTwitter = tm_map(myCorpusTwitter, stripWhitespace)
myCorpusTwitter = tm_map(myCorpusTwitter, tolower)
Creating ngrams

Most frequently used words in data sets:

  • Blogs.txt
myCorpusBlogs <- tm_map(myCorpusBlogs, PlainTextDocument)
wordcloud(myCorpusBlogs, scale=c(5,0.5), max.words=100, random.order=FALSE, rot.per=0.35, use.r.layout=FALSE, colors=brewer.pal(8,'Dark2'))

plot of chunk blogsCloud

  • News.txt
myCorpusNews <- tm_map(myCorpusNews, PlainTextDocument)
wordcloud(myCorpusNews, scale=c(5,0.5), max.words=100, random.order=FALSE, rot.per=0.35, use.r.layout=FALSE, colors=brewer.pal(8,'Dark2'))

plot of chunk newsCloud

  • Twitter.txt
myCorpusTwitter <- tm_map(myCorpusTwitter, PlainTextDocument)
wordcloud(myCorpusTwitter, scale=c(5,0.5), max.words=100, random.order=FALSE, rot.per=0.35, use.r.layout=FALSE, colors=brewer.pal(8,'Dark2'))

plot of chunk twitterCloud

Calculating bigrams:

  • Blogs.txt
myCorpusBlogsTemp <- txt.to.words(subBlogs)
bigramB <- make.ngrams(myCorpusBlogsTemp, ngram.size = 2)
bigramB <- table(bigramB)
bigramB <- sort(bigramB, decreasing = TRUE)
bigramB[1:20]
## bigramB
##  of the  in the    it s  to the  on the   to be     i m   don t and the 
##    1028     848     467     464     420     363     329     328     321 
## for the   and i  i have  it was   it is  at the   i was  that i    in a 
##     311     289     287     270     259     256     252     252     245 
##    is a    i am 
##     244     243
  • News.txt
myCorpusNewsTemp <- txt.to.words(subNews)
bigramN <- make.ngrams(myCorpusNewsTemp, ngram.size = 2)
bigramN <- table(bigramN)
bigramN <- sort(bigramN, decreasing = TRUE)
bigramN[1:20]
## bigramN
##   of the   in the   to the   on the     it s  for the   at the  and the 
##      959      918      401      383      335      332      313      256 
##    to be     in a with the from the  he said     of a  will be   with a 
##      239      237      231      196      175      164      163      163 
##    for a that the     is a    don t 
##      159      157      155      151
  • Twitter.txt
myCorpusTwitterTemp <- txt.to.words(subTwitter)
bigramT <- make.ngrams(myCorpusTwitterTemp, ngram.size = 2)
bigramT <- table(bigramT)
bigramT <- sort(bigramT, decreasing = TRUE)
bigramT[1:20]
## bigramT
##        i m       it s      don t     in the    for the     of the 
##        266        186        170        166        148        134 
##     on the      to be     you re     to the      can t     that s 
##        111         91         90         89         86         85 
##   going to     have a       i am     if you     at the      for a 
##         79         79         76         76         73         73 
## thanks for      i can 
##         72         70

Creating trigrams:

  • Blogs.txt
myCorpusBlogsTemp3 <- txt.to.words(subBlogs)
trigramB3 <- make.ngrams(myCorpusBlogsTemp3, ngram.size = 3)
trigramB3 <- data.frame(table(trigramB3))
trigramB3<-trigramB3[order(trigramB3$Freq,decreasing = TRUE),]
tempB<- trigramB3[1:20,]
ggplot(tempB,aes(x=trigramB3,y=Freq)) + geom_bar(stat="identity",fill="maroon") + labs(x="Three Gram Tokens") + labs(y="Frequency") + labs(title="Top 20 Three Gram Tokens") + theme(axis.text.x = element_text(angle = 45, hjust = 1))

plot of chunk triB

  • News.txt
myCorpusNewsTemp3 <- txt.to.words(subNews)
trigramN3 <- make.ngrams(myCorpusNewsTemp3, ngram.size = 3)
trigramN3<-data.frame(table(trigramN3))
trigramN3<-trigramN3[order(trigramN3$Freq,decreasing = TRUE),]
tempN<- trigramN3[1:20,]
ggplot(tempN,aes(x=trigramN3,y=Freq)) + geom_bar(stat="identity",fill="maroon") + labs(x="Three Gram Tokens") + labs(y="Frequency") + labs(title="Top 20 Three Gram Tokens") + theme(axis.text.x = element_text(angle = 45, hjust = 1))

plot of chunk triN

  • Twitter.txt
myCorpusTwitterTemp3 <- txt.to.words(subTwitter)
trigramT3 <- make.ngrams(myCorpusTwitterTemp3, ngram.size = 3)
trigramT3<-data.frame(table(trigramT3))
trigramT3<-trigramT3[order(trigramT3$Freq,decreasing = TRUE),]
tempT<- trigramT3[1:20,]
ggplot(tempT,aes(x=trigramT3,y=Freq)) + geom_bar(stat="identity",fill="maroon") + labs(x="Three Gram Tokens") + labs(y="Frequency") + labs(title="Top 20 Three Gram Tokens") + theme(axis.text.x = element_text(angle = 45, hjust = 1))

plot of chunk triT