Around the world, people are spending an increasing amount of time on their mobile devices for email, social networking, banking and a whole range of other activities. But typing on a mobile device can be a serious pain. SwiftKey, our corporate partner in this capstone, builds a smart keyboard that makes it easier for people to type on their mobile devices. One cornerstone of that smart keyboard is its predictive text models: when someone types “I went to the”, the keyboard presents three options for what the next word might be, for example gym, store or restaurant. The three data sets provided by SwiftKey (blogs, news and twitter) were downloaded and unzipped.
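For completeness, this is roughly how the files can be obtained; the URL below is the standard course download link and is stated here as an assumption, since the report begins from the already-unzipped files.
url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
download.file(url, destfile = "Coursera-SwiftKey.zip", mode = "wb")
unzip("Coursera-SwiftKey.zip")  # extracts a final/ folder containing en_US and other locales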
Packages installed:
library(knitr)
library(NLP)
library(tm)
library(stylo)
library(wordcloud)
library(ngram)
library(ggplot2)
##
## Attaching package: 'ggplot2'
##
## The following object is masked from 'package:NLP':
##
## annotate
library(stringr)
library(R.utils)
Loading data:
# read each corpus line-by-line (the warnings below flag stray NUL bytes in the twitter file)
blogs <- readLines("/Users/arushigulati/Desktop/Capstone/final/en_US/en_US.blogs.txt")
news <- readLines("/Users/arushigulati/Desktop/Capstone/final/en_US/en_US.news.txt")
twitter <- readLines("/Users/arushigulati/Desktop/Capstone/final/en_US/en_US.twitter.txt")
## Warning: line 167155 appears to contain an embedded nul
## Warning: line 268547 appears to contain an embedded nul
## Warning: line 1274086 appears to contain an embedded nul
## Warning: line 1759032 appears to contain an embedded nul
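The embedded-nul warnings come from stray NUL bytes in the twitter file. If they matter, base readLines can simply drop them; using skipNul here is an assumption, not part of the original run.
twitter <- readLines("/Users/arushigulati/Desktop/Capstone/final/en_US/en_US.twitter.txt",
                     skipNul = TRUE)  # silently skip embedded NUL bytes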
Number of lines and words in each file:
lenB <- length(blogs)    # line count, blogs
wordsB <- scan("/Users/arushigulati/Desktop/Capstone/final/en_US/en_US.blogs.txt", character(0))   # whitespace-delimited tokens
lenN <- length(news)     # line count, news
wordsN <- scan("/Users/arushigulati/Desktop/Capstone/final/en_US/en_US.news.txt", character(0))
lenT <- length(twitter)  # line count, twitter
wordsT <- scan("/Users/arushigulati/Desktop/Capstone/final/en_US/en_US.twitter.txt", character(0))
## Warning: EOF within quoted string
## Warning: embedded nul(s) found in input
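The counts themselves are easy to tabulate; a minimal sketch using stringr (loaded above) to count words, instead of relying on the scan() vectors:
data.frame(
  file  = c("blogs", "news", "twitter"),
  lines = c(lenB, lenN, lenT),                   # line counts from above
  words = c(sum(str_count(blogs, "\\S+")),       # runs of non-space characters = words
            sum(str_count(news, "\\S+")),
            sum(str_count(twitter, "\\S+")))
)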
Sampling 5,000 lines from each data set:
set.seed(1234)  # a fixed seed (hypothetical) makes the sample reproducible
subBlogs <- sample(blogs, 5000)
subNews <- sample(news, 5000)
subTwitter <- sample(twitter, 5000)
Profanity filtering for all three data sets:
badwords <- read.table("/Users/arushigulati/Desktop/Capstone/final/en_US/badwords.txt", stringsAsFactors = FALSE)
myCorpusBlogs <- Corpus(VectorSource(subBlogs))
myCorpusBlogs <- tm_map(myCorpusBlogs, removeWords, badwords$V1[1:500])  # only the first 500 terms: removeWords builds one large regular expression
myCorpusNews <- Corpus(VectorSource(subNews))
myCorpusNews <- tm_map(myCorpusNews, removeWords, badwords$V1[1:500])
myCorpusTwitter <- Corpus(VectorSource(subTwitter))
myCorpusTwitter <- tm_map(myCorpusTwitter, removeWords, badwords$V1[1:500])
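As the comment above notes, removeWords collapses its word list into a single regular expression, which can fail for very long lists; a small hypothetical helper applies the full list in chunks instead of stopping at 500 terms:
removeWordsInChunks <- function(corpus, words, chunkSize = 500) {
  # apply removeWords over successive slices so each generated regex stays small
  for (i in seq(1, length(words), by = chunkSize)) {
    corpus <- tm_map(corpus, removeWords, words[i:min(i + chunkSize - 1, length(words))])
  }
  corpus
}
# e.g. myCorpusBlogs <- removeWordsInChunks(myCorpusBlogs, badwords$V1)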
Removing punctuation and extra white space, and converting all text to lowercase (tolower is wrapped in content_transformer so that current versions of tm keep the corpus structure intact):
myCorpusBlogs <- tm_map(myCorpusBlogs, removePunctuation)
myCorpusBlogs <- tm_map(myCorpusBlogs, content_transformer(tolower))
myCorpusBlogs <- tm_map(myCorpusBlogs, stripWhitespace)
myCorpusNews <- tm_map(myCorpusNews, removePunctuation)
myCorpusNews <- tm_map(myCorpusNews, content_transformer(tolower))
myCorpusNews <- tm_map(myCorpusNews, stripWhitespace)
myCorpusTwitter <- tm_map(myCorpusTwitter, removePunctuation)
myCorpusTwitter <- tm_map(myCorpusTwitter, content_transformer(tolower))
myCorpusTwitter <- tm_map(myCorpusTwitter, stripWhitespace)
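Two further cleaning steps would fit naturally here (they are assumptions, not part of the original run): stripping digits with tm's removeNumbers, and deleting URLs with a custom content_transformer. Shown for the blogs corpus only:
removeURL <- content_transformer(function(x) gsub("http\\S+", "", x))  # drop anything starting with http
myCorpusBlogs <- tm_map(myCorpusBlogs, removeNumbers)
myCorpusBlogs <- tm_map(myCorpusBlogs, removeURL)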
Most frequently used words in each data set, shown as word clouds:
myCorpusBlogs <- tm_map(myCorpusBlogs, PlainTextDocument)
wordcloud(myCorpusBlogs, scale=c(5,0.5), max.words=100, random.order=FALSE, rot.per=0.35, use.r.layout=FALSE, colors=brewer.pal(8,'Dark2'))
myCorpusNews <- tm_map(myCorpusNews, PlainTextDocument)
wordcloud(myCorpusNews, scale=c(5,0.5), max.words=100, random.order=FALSE, rot.per=0.35, use.r.layout=FALSE, colors=brewer.pal(8,'Dark2'))
myCorpusTwitter <- tm_map(myCorpusTwitter, PlainTextDocument)
wordcloud(myCorpusTwitter, scale=c(5,0.5), max.words=100, random.order=FALSE, rot.per=0.35, use.r.layout=FALSE, colors=brewer.pal(8,'Dark2'))
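The clouds are qualitative; the counts behind them can be read off a term-document matrix. A sketch for the blogs corpus (the report itself only shows the clouds):
tdmB <- TermDocumentMatrix(myCorpusBlogs)
freqB <- sort(rowSums(as.matrix(tdmB)), decreasing = TRUE)  # total frequency per term
head(freqB, 20)  # the 20 terms that dominate the blogs cloud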
Calculating bigrams:
myCorpusBlogsTemp <- txt.to.words(subBlogs)
bigramB <- make.ngrams(myCorpusBlogsTemp, ngram.size = 2)
bigramB <- table(bigramB)
bigramB <- sort(bigramB, decreasing = TRUE)
bigramB[1:20]
## bigramB
##   of the (1028)   in the (848)    it s (467)      to the (464)    on the (420)
##   to be (363)     i m (329)       don t (328)     and the (321)   for the (311)
##   and i (289)     i have (287)    it was (270)    it is (259)     at the (256)
##   i was (252)     that i (252)    in a (245)      is a (244)      i am (243)
myCorpusNewsTemp <- txt.to.words(subNews)
bigramN <- make.ngrams(myCorpusNewsTemp, ngram.size = 2)
bigramN <- table(bigramN)
bigramN <- sort(bigramN, decreasing = TRUE)
bigramN[1:20]
## bigramN
##   of the (959)    in the (918)    to the (401)    on the (383)    it s (335)
##   for the (332)   at the (313)    and the (256)   to be (239)     in a (237)
##   with the (231)  from the (196)  he said (175)   of a (164)      will be (163)
##   with a (163)    for a (159)     that the (157)  is a (155)      don t (151)
myCorpusTwitterTemp <- txt.to.words(subTwitter)
bigramT <- make.ngrams(myCorpusTwitterTemp, ngram.size = 2)
bigramT <- table(bigramT)
bigramT <- sort(bigramT, decreasing = TRUE)
bigramT[1:20]
## bigramT
##   i m (266)       it s (186)      don t (170)     in the (166)    for the (148)
##   of the (134)    on the (111)    to be (91)      you re (90)     to the (89)
##   can t (86)      that s (85)     going to (79)   have a (79)     i am (76)
##   if you (76)     at the (73)     for a (73)      thanks for (72) i can (70)
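The tokenise / count / sort pattern above is repeated verbatim for each source; a small hypothetical helper would collapse each block into one call:
topNgrams <- function(lines, n, k = 20) {
  # tokenise with stylo, build n-grams, and return the k most frequent
  grams <- make.ngrams(txt.to.words(lines), ngram.size = n)
  sort(table(grams), decreasing = TRUE)[1:k]
}
# e.g. topNgrams(subBlogs, 2) reproduces bigramB[1:20]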
Creating trigrams:
# the tokens from the bigram step can be reused for the trigrams
trigramB3 <- make.ngrams(myCorpusBlogsTemp, ngram.size = 3)
trigramB3 <- data.frame(table(trigramB3))
trigramB3 <- trigramB3[order(trigramB3$Freq, decreasing = TRUE), ]
tempB <- trigramB3[1:20, ]
# reorder() sorts the bars by frequency; otherwise ggplot orders them alphabetically
ggplot(tempB, aes(x = reorder(trigramB3, -Freq), y = Freq)) +
  geom_bar(stat = "identity", fill = "maroon") +
  labs(x = "Trigram Tokens", y = "Frequency", title = "Top 20 Trigram Tokens") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
trigramN3 <- make.ngrams(myCorpusNewsTemp, ngram.size = 3)
trigramN3 <- data.frame(table(trigramN3))
trigramN3 <- trigramN3[order(trigramN3$Freq, decreasing = TRUE), ]
tempN <- trigramN3[1:20, ]
ggplot(tempN, aes(x = reorder(trigramN3, -Freq), y = Freq)) +
  geom_bar(stat = "identity", fill = "maroon") +
  labs(x = "Trigram Tokens", y = "Frequency", title = "Top 20 Trigram Tokens") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
trigramT3 <- make.ngrams(myCorpusTwitterTemp, ngram.size = 3)
trigramT3 <- data.frame(table(trigramT3))
trigramT3 <- trigramT3[order(trigramT3$Freq, decreasing = TRUE), ]
tempT <- trigramT3[1:20, ]
ggplot(tempT, aes(x = reorder(trigramT3, -Freq), y = Freq)) +
  geom_bar(stat = "identity", fill = "maroon") +
  labs(x = "Trigram Tokens", y = "Frequency", title = "Top 20 Trigram Tokens") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
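These trigram tables already support a crude version of the next-word suggestion described in the introduction. A minimal sketch (predictNext is hypothetical; a real model would add smoothing and back-off to bigrams and unigrams):
predictNext <- function(trigramDF, w1, w2, k = 3) {
  prefix <- paste(w1, w2, "")  # trailing space guards against partial word matches
  hits <- trigramDF[startsWith(as.character(trigramDF[[1]]), prefix), ]
  hits <- hits[order(hits$Freq, decreasing = TRUE), ]
  # the suggestions are the third words of the top k matching trigrams
  sapply(strsplit(as.character(head(hits[[1]], k)), " "), `[`, 3)
}
predictNext(trigramB3, "i", "went")  # up to three candidate next words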