Natural Language Processing: Predictive Text Exploratory Analysis

Data Processing

Packages installed:

library(knitr)
library(NLP)
library(tm)
library(stylo)
library(wordcloud)
library(ngram)
library(ggplot2)

## 
## Attaching package: 'ggplot2'
## 
## The following object is masked from 'package:NLP':
## 
##     annotate

library(stringr)
library(R.utils)

Loading data:

blogs <- readLines("/Users/arushigulati/Desktop/Capstone/final/en_US/en_US.blogs.txt")
news <- readLines("/Users/arushigulati/Desktop/Capstone/final/en_US/en_US.news.txt")
twitter <- readLines("/Users/arushigulati/Desktop/Capstone/final/en_US/en_US.twitter.txt")

## Warning: line 167155 appears to contain an embedded nul
## Warning: line 268547 appears to contain an embedded nul
## Warning: line 1274086 appears to contain an embedded nul
## Warning: line 1759032 appears to contain an embedded nul

Basic Analysis on Files

Number of lines in each file:

lenB <- length(blogs)
wordsB <- scan("/Users/arushigulati/Desktop/Capstone/final/en_US/en_US.blogs.txt", character(0))
lenN <- length(news)
wordsN <- scan("/Users/arushigulati/Desktop/Capstone/final/en_US/en_US.news.txt", character(0))
lenT <- length(twitter)
wordsT <- scan("/Users/arushigulati/Desktop/Capstone/final/en_US/en_US.twitter.txt", character(0))

## Warning: EOF within quoted string
## Warning: embedded nul(s) found in input

Result:

Blogs.txt:
- Number of lines 899288
- Number of words 35314175
News.txt:
- Number of lines 1010242
- Number of words- 29313276
Twitter.txt:
- Number of lines 2360148
- Number of words 9141409

Sub-sampling data

subBlogs <- sample(blogs, 5000)
subNews <- sample(news, 5000)
subTwitter <- sample(twitter, 5000)

Cleaning Data

Profanity filtering for all 3 data sets:

badwords <- as.list(read.table("/Users/arushigulati/Desktop/Capstone/final/en_US/badwords.txt", stringsAsFactors=FALSE))
myCorpusBlogs <- Corpus(VectorSource(subBlogs))
myCorpusBlogs = tm_map(myCorpusBlogs, removeWords, badwords$V1[1:500])
myCorpusNews <- Corpus(VectorSource(subNews))
myCorpusNews = tm_map(myCorpusNews, removeWords, badwords$V1[1:500])
myCorpusTwitter <- Corpus(VectorSource(subTwitter))
myCorpusTwitter = tm_map(myCorpusTwitter, removeWords, badwords$V1[1:500])

Removing punctuation, extra white spaces and converting all text to lowercase:

myCorpusBlogs = tm_map(myCorpusBlogs, removePunctuation)
myCorpusBlogs = tm_map(myCorpusBlogs, tolower)
myCorpusBlogs = tm_map(myCorpusBlogs, stripWhitespace)
myCorpusNews = tm_map(myCorpusNews, removePunctuation)
myCorpusNews = tm_map(myCorpusNews, stripWhitespace)
myCorpusNews = tm_map(myCorpusNews, tolower)
myCorpusTwitter = tm_map(myCorpusTwitter, removePunctuation)
myCorpusTwitter = tm_map(myCorpusTwitter, stripWhitespace)
myCorpusTwitter = tm_map(myCorpusTwitter, tolower)

Creating ngrams

Most frequently used words in data sets:

Blogs.txt

myCorpusBlogs <- tm_map(myCorpusBlogs, PlainTextDocument)
wordcloud(myCorpusBlogs, scale=c(5,0.5), max.words=100, random.order=FALSE, rot.per=0.35, use.r.layout=FALSE, colors=brewer.pal(8,'Dark2'))

plot of chunk blogsCloud

News.txt

myCorpusNews <- tm_map(myCorpusNews, PlainTextDocument)
wordcloud(myCorpusNews, scale=c(5,0.5), max.words=100, random.order=FALSE, rot.per=0.35, use.r.layout=FALSE, colors=brewer.pal(8,'Dark2'))

plot of chunk newsCloud

Twitter.txt

myCorpusTwitter <- tm_map(myCorpusTwitter, PlainTextDocument)
wordcloud(myCorpusTwitter, scale=c(5,0.5), max.words=100, random.order=FALSE, rot.per=0.35, use.r.layout=FALSE, colors=brewer.pal(8,'Dark2'))

plot of chunk twitterCloud

Calculating bigrams:

Blogs.txt

myCorpusBlogsTemp <- txt.to.words(subBlogs)
bigramB <- make.ngrams(myCorpusBlogsTemp, ngram.size = 2)
bigramB <- table(bigramB)
bigramB <- sort(bigramB, decreasing = TRUE)
bigramB[1:20]

## bigramB
##  of the  in the    it s  to the  on the   to be     i m   don t and the 
##    1028     848     467     464     420     363     329     328     321 
## for the   and i  i have  it was   it is  at the   i was  that i    in a 
##     311     289     287     270     259     256     252     252     245 
##    is a    i am 
##     244     243

News.txt

myCorpusNewsTemp <- txt.to.words(subNews)
bigramN <- make.ngrams(myCorpusNewsTemp, ngram.size = 2)
bigramN <- table(bigramN)
bigramN <- sort(bigramN, decreasing = TRUE)
bigramN[1:20]

## bigramN
##   of the   in the   to the   on the     it s  for the   at the  and the 
##      959      918      401      383      335      332      313      256 
##    to be     in a with the from the  he said     of a  will be   with a 
##      239      237      231      196      175      164      163      163 
##    for a that the     is a    don t 
##      159      157      155      151

Twitter.txt

myCorpusTwitterTemp <- txt.to.words(subTwitter)
bigramT <- make.ngrams(myCorpusTwitterTemp, ngram.size = 2)
bigramT <- table(bigramT)
bigramT <- sort(bigramT, decreasing = TRUE)
bigramT[1:20]

## bigramT
##        i m       it s      don t     in the    for the     of the 
##        266        186        170        166        148        134 
##     on the      to be     you re     to the      can t     that s 
##        111         91         90         89         86         85 
##   going to     have a       i am     if you     at the      for a 
##         79         79         76         76         73         73 
## thanks for      i can 
##         72         70

Creating trigrams:

Blogs.txt

myCorpusBlogsTemp3 <- txt.to.words(subBlogs)
trigramB3 <- make.ngrams(myCorpusBlogsTemp3, ngram.size = 3)
trigramB3 <- data.frame(table(trigramB3))
trigramB3<-trigramB3[order(trigramB3$Freq,decreasing = TRUE),]
tempB<- trigramB3[1:20,]
ggplot(tempB,aes(x=trigramB3,y=Freq)) + geom_bar(stat="identity",fill="maroon") + labs(x="Three Gram Tokens") + labs(y="Frequency") + labs(title="Top 20 Three Gram Tokens") + theme(axis.text.x = element_text(angle = 45, hjust = 1))

plot of chunk triB

News.txt

myCorpusNewsTemp3 <- txt.to.words(subNews)
trigramN3 <- make.ngrams(myCorpusNewsTemp3, ngram.size = 3)
trigramN3<-data.frame(table(trigramN3))
trigramN3<-trigramN3[order(trigramN3$Freq,decreasing = TRUE),]
tempN<- trigramN3[1:20,]
ggplot(tempN,aes(x=trigramN3,y=Freq)) + geom_bar(stat="identity",fill="maroon") + labs(x="Three Gram Tokens") + labs(y="Frequency") + labs(title="Top 20 Three Gram Tokens") + theme(axis.text.x = element_text(angle = 45, hjust = 1))

plot of chunk triN

Twitter.txt

myCorpusTwitterTemp3 <- txt.to.words(subTwitter)
trigramT3 <- make.ngrams(myCorpusTwitterTemp3, ngram.size = 3)
trigramT3<-data.frame(table(trigramT3))
trigramT3<-trigramT3[order(trigramT3$Freq,decreasing = TRUE),]
tempT<- trigramT3[1:20,]
ggplot(tempT,aes(x=trigramT3,y=Freq)) + geom_bar(stat="identity",fill="maroon") + labs(x="Three Gram Tokens") + labs(y="Frequency") + labs(title="Top 20 Three Gram Tokens") + theme(axis.text.x = element_text(angle = 45, hjust = 1))

plot of chunk triT