Data Exploration - NLP

The goal of this exercise is to explore the three datasets provided: en_US.blogs.txt, en_US.news.txt, and en_US.twitter.txt.

All three files contain natural-language text and will be used to develop a model that predicts the next word after a user enters a two- or three-word phrase.
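As a preview of where this is heading, here is a minimal sketch of that lookup, assuming the trigram table built later in this report (columns word1, word2, word3, frequency); predict_next is an illustrative name, not part of the final model:

predict_next <- function(trigram, w1, w2) {
  matches <- trigram[trigram$word1 == w1 & trigram$word2 == w2, ]
  if (nrow(matches) == 0) return(NA_character_)   # no match; a real model would back off to bigrams
  matches$word3[which.max(matches$frequency)]     # most frequent continuation
}
# e.g. predict_next(trigram, "one", "of") should return "the", given the trigram counts shown later.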

require(stringi)
## Loading required package: stringi
require(NLP) 
## Loading required package: NLP
require(tm) 
## Loading required package: tm
## Warning: package 'tm' was built under R version 3.4.1
require(rJava)
## Loading required package: rJava
require(RWeka) 
## Loading required package: RWeka
## Warning: package 'RWeka' was built under R version 3.4.1
require(RWekajars)
## Loading required package: RWekajars
require(RColorBrewer) 
## Loading required package: RColorBrewer
require(ggplot2) 
## Loading required package: ggplot2
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate
location <- "WORK"          # Location is either HOME or WORK

set.seed(52)

filename_blogs <- "en_US.blogs.txt"
filename_news <- "en_US.news.txt"
filename_twitter <- "en_US.twitter.txt"

getwd()
## [1] "C:/Users/jmcconn3/Dropbox/DataScienceSpecializationJHU/Capstone - Class 10/FinalProject"
if (location == "HOME") {
        directory_location <- "C:/Users/johns dell/Dropbox/DataScienceSpecializationJHU/Capstone - Class 10/FinalProject/en_US/Data/"
        filename_profanity <- "C:/Users/johns dell/Dropbox/DataScienceSpecializationJHU/Capstone - Class 10/FinalProject/en_US/profanitylist.txt"
        
} else {
        directory_location <- "C:/Users/jmcconn3/Dropbox/DataScienceSpecializationJHU/Capstone - Class 10/FinalProject/en_US/Data/"
        filename_profanity <- "C:/Users/jmcconn3/Dropbox/DataScienceSpecializationJHU/Capstone - Class 10/FinalProject/en_US/profanitylist.txt"
}
        
setwd(directory_location)

getwd()
## [1] "C:/Users/jmcconn3/Dropbox/DataScienceSpecializationJHU/Capstone - Class 10/FinalProject/en_US/Data"
# Read each file in binary mode so embedded nulls and odd encodings do not stop readLines;
# close each connection after reading.
blogsfile <- file(filename_blogs, open = "rb")
blogs <- readLines(blogsfile, encoding = "UTF-8", skipNul = TRUE)
close(blogsfile)

newsfile <- file(filename_news, open = "rb")
news <- readLines(newsfile, encoding = "UTF-8", skipNul = TRUE)
close(newsfile)

twitterfile <- file(filename_twitter, open = "rb")
twitter <- readLines(twitterfile, encoding = "UTF-8", skipNul = TRUE)
close(twitterfile)

profanitywordslist <- readLines(filename_profanity)
## Warning in readLines(filename_profanity): incomplete final line found on
## 'C:/Users/jmcconn3/Dropbox/DataScienceSpecializationJHU/Capstone - Class
## 10/FinalProject/en_US/profanitylist.txt'
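The warning above only means the profanity file lacks a trailing newline; the list is still read in full. If desired, the warning can be suppressed with readLines' warn argument:

# profanitywordslist <- readLines(filename_profanity, warn = FALSE)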
## Size of Files

file.info(filename_blogs)$size/1024^2 
## [1] 200.4242
file.info(filename_news)$size/1024^2 
## [1] 196.2775
file.info(filename_twitter)$size / 1024^2
## [1] 159.3641
## Number of lines

length(blogs)
## [1] 899288
length(news) 
## [1] 1010242
length(twitter)
## [1] 2360148
## Counting the Words

sum(stri_count_words(blogs))  
## [1] 37546246
sum(stri_count_words(news))   
## [1] 34762395
sum(stri_count_words(twitter))  
## [1] 30093410
## The length of the longest line seen in any of the three en_US data sets 

max(nchar(blogs))  
## [1] 40833
max(nchar(news))    
## [1] 11384
max(nchar(twitter))  
## [1] 140
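A sketch that gathers the statistics above into a single table for easier comparison (file_stats is an illustrative name; the values are the ones just computed):

files <- c(filename_blogs, filename_news, filename_twitter)
texts <- list(blogs, news, twitter)
file_stats <- data.frame(
  file    = files,
  MB      = unname(sapply(files, function(f) file.info(f)$size / 1024^2)),
  lines   = sapply(texts, length),
  words   = sapply(texts, function(x) sum(stri_count_words(x))),
  longest = sapply(texts, function(x) max(nchar(x)))
)
file_stats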
set.seed(52)
sample_twitter <- sample(twitter, size = 2000, replace = TRUE)
sample_blogs <- sample(blogs, size = 2000, replace = TRUE)
sample_news <- sample(news, size = 2000, replace = TRUE)
sample_total <- c(sample_twitter, sample_blogs, sample_news)
length(sample_total)
## [1] 6000
writeLines(sample_total, "thesample.txt")
cleanup_text <- content_transformer(function(doc)
{
  # The following substitutions clean up the text

  doc <- gsub("(f|ht)tp(s?)://(.*)[.][a-z]+", "", doc)        # drop full URLs
  doc <- gsub("(f|ht)tp\\S+\\s*", "", doc)                    # drop leftover http/ftp fragments
  doc <- gsub("([0-9])(st|nd|rd|th)", "\\1", doc)             # strip ordinal suffixes (e.g. 4th -> 4)
  doc <- gsub("[^a-z.' ]", " ", doc)                          # keep only lowercase letters, periods, apostrophes
  doc <- gsub("www\\.[a-z]+\\.[a-z]+", "", doc)               # drop www addresses
  doc <- gsub("\\.", " ", doc)                                # replace periods with spaces
  doc <- gsub("\\'+ \\'+", " ", doc)                          # collapse adjacent stray apostrophes
  doc <- gsub("(\\'+ )+|( \\'+)+|^\\'+|\\'+$", " ", doc)      # drop dangling apostrophes
  doc <- gsub("^[a-z]+$", "", doc)                            # drop documents that are a single word
  doc <- gsub("( [^ai])+ |^([^ai] )+|( [^ai])+$", " ", doc)   # drop single letters other than 'a' and 'i'
  doc <- gsub("^ +| +$", "", doc)                             # trim leading and trailing spaces
  doc <- gsub("'", "", doc)                                   # remove remaining apostrophes

  return(doc)
})
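A quick way to see what the transformer does, using a made-up lowercase example string (the pipeline below applies it after tolower):

demo_doc <- PlainTextDocument("check out http://example.com on the 4th it's great")
content(cleanup_text(demo_doc))
# expect roughly "check out on the its great" (internal spacing may vary)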
## Using the TM Package to clean the Corpus Text

thecorpus <- readLines("thesample.txt")

thecorpus <- Corpus(VectorSource(thecorpus))   # one document per line

## Applying the TM transformations

thecorpus <- tm_map(thecorpus, content_transformer(function(x) iconv(x, to="UTF-8", sub="byte")))

thecorpus <- tm_map(thecorpus, content_transformer(tolower))  
thecorpus <- tm_map(thecorpus, content_transformer(removePunctuation), preserve_intra_word_dashes=TRUE)  

# Removing Profanity Words

thecorpus <- tm_map(thecorpus,removeWords, profanitywordslist)

thecorpus <- tm_map(thecorpus, content_transformer(removeNumbers)) 

thecorpus <- tm_map(thecorpus, stripWhitespace)  

thecorpus <- tm_map(thecorpus, cleanup_text)

thecorpus <- tm_map(thecorpus, PlainTextDocument) 

summary(thecorpus)
##         Length Class             Mode
## content 2      PlainTextDocument list
## meta    2      PlainTextDocument list
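Before saving, it is worth spot-checking a few cleaned documents:

for (i in 1:3) print(content(thecorpus[[i]]))   # sanity check on the first three documents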
saveRDS(thecorpus, file = "thecorpus.RData")

#thecorpus <- readRDS("thecorpus.RData")
unigram <- NGramTokenizer(thecorpus, Weka_control(min = 1, max = 1,delimiters = " \\r\\n\\t.,;:\"()?!"))
unigram <- data.frame(table(unigram))
unigram <- unigram[order(unigram$Freq,decreasing = TRUE),]

names(unigram) <- c("word1", "frequency")
head(unigram)
##       word1 frequency
## 18010   the      9052
## 18256    to      4901
## 653     and      4586
## 12        a      4236
## 12430    of      3873
## 8915     in      2979
unigram$word1 <- as.character(unigram$word1)

write.csv(unigram[unigram$frequency > 1,],"unigram.csv",row.names=F)
unigram <- read.csv("unigram.csv",stringsAsFactors = F)
saveRDS(unigram, file = "unigram.RData")

## Unigram Plot

unigram <- readRDS("unigram.RData")

unigram_print_frame <- data.frame(words = unigram$word1[1:10], count = unigram$frequency[1:10])

p1 <- ggplot(data=unigram_print_frame, aes(x = words, y = count))
p1 <- p1 + geom_bar(stat="identity", color="black", fill="blue") 
p1 <- p1 + ggtitle("Frequently Used Unigrams") + theme(plot.title = element_text(hjust=0.5))
p1 <- p1 + theme(axis.text.x=element_text(angle=90))
p1
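The five plotting lines above recur for each n-gram size below; one possible consolidation (plot_ngrams is an illustrative name; reorder() additionally sorts the bars by count, which these blocks otherwise leave in alphabetical order):

plot_ngrams <- function(df, title) {
  ggplot(df, aes(x = reorder(words, -count), y = count)) +
    geom_bar(stat = "identity", color = "black", fill = "blue") +
    ggtitle(title) + xlab("words") +
    theme(plot.title = element_text(hjust = 0.5),
          axis.text.x = element_text(angle = 90))
}
# e.g. plot_ngrams(unigram_print_frame, "Frequently Used Unigrams")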

rm(unigram)
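Similarly, the bigram through sixgram sections below all repeat the same tokenize, tabulate, and split steps; a helper that parameterizes the n-gram size (build_ngram_table is an illustrative name, not used elsewhere in this report):

build_ngram_table <- function(corpus, n) {
  grams <- NGramTokenizer(corpus, Weka_control(min = n, max = n,
                          delimiters = " \\r\\n\\t.,;:\"()?!"))
  tbl <- data.frame(table(grams))
  tbl <- tbl[order(tbl$Freq, decreasing = TRUE), ]
  words <- do.call(rbind, strsplit(as.character(tbl$grams), split = " "))
  out <- data.frame(words, frequency = tbl$Freq, stringsAsFactors = FALSE)
  names(out) <- c(paste0("word", 1:n), "frequency")
  out
}
# e.g. bigram <- build_ngram_table(thecorpus, 2)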
# Tokenizer function to get bigrams
bigram <- NGramTokenizer(thecorpus, Weka_control(min = 2, max = 2,delimiters = " \\r\\n\\t.,;:\"()?!"))
bigram <- data.frame(table(bigram))
bigram <- bigram[order(bigram$Freq,decreasing = TRUE),]
names(bigram) <- c("words","frequency")
head(bigram)
##         words frequency
## 64747  of the       874
## 46232  in the       770
## 97665  to the       427
## 65923  on the       392
## 34675 for the       339
## 96486   to be       300
bigram$words <- as.character(bigram$words)
str2 <- strsplit(bigram$words,split=" ")
bigram <- transform(bigram, 
                    one = sapply(str2,"[[",1),   
                    two = sapply(str2,"[[",2))
bigram <- data.frame(word1 = bigram$one, word2=bigram$two, frequency=bigram$frequency, stringsAsFactors=FALSE)

write.csv(bigram[bigram$frequency > 1,],"bigram.csv",row.names=F)
bigram <- read.csv("bigram.csv",stringsAsFactors = F)
saveRDS(bigram,"bigram.RData")

bigram <- readRDS("bigram.RData")

bigram_plotdata <- paste(bigram$word1[1:10], bigram$word2[1:10])
bigram_print_frame <- data.frame(words = bigram_plotdata, count = bigram$frequency[1:10])

p2 <- ggplot(data=bigram_print_frame, aes(x = words, y = count))
p2 <- p2 + geom_bar(stat="identity", color="black", fill="blue") 
p2 <- p2 + ggtitle("Frequently Used Bigrams") + theme(plot.title = element_text(hjust=0.5))
p2 <- p2 + theme(axis.text.x=element_text(angle=90))
p2

rm(bigram)
# Tokenizer function to get trigrams

trigram <- NGramTokenizer(thecorpus, Weka_control(min = 3, max = 3,delimiters = " \\r\\n\\t.,;:\"()?!"))
trigram <- data.frame(table(trigram))
trigram <- trigram[order(trigram$Freq,decreasing = TRUE),]
names(trigram) <- c("words","frequency")
head(trigram)
##              words frequency
## 97539   one of the        67
## 1860      a lot of        52
## 99994   out of the        40
## 140804     to be a        37
## 19126   be able to        34
## 101341 part of the        32
trigram$words <- as.character(trigram$words)
str3 <- strsplit(trigram$words,split=" ")
trigram <- transform(trigram,
                     one = sapply(str3,"[[",1),
                     two = sapply(str3,"[[",2),
                     three = sapply(str3,"[[",3))
trigram <- data.frame(word1=trigram$one,
                      word2=trigram$two, 
                      word3=trigram$three, frequency=trigram$frequency,stringsAsFactors=FALSE)
write.csv(trigram[trigram$frequency > 1,],"trigram.csv",row.names=F)
trigram <- read.csv("trigram.csv",stringsAsFactors = F)
saveRDS(trigram,"trigram.RData")

trigram <- readRDS("trigram.RData")

trigram_plotdata <- paste(trigram$word1[1:10], trigram$word2[1:10], trigram$word3[1:10])
trigram_print_frame <- data.frame(words = trigram_plotdata, count = trigram$frequency[1:10])

p3 <- ggplot(data=trigram_print_frame, aes(x = words, y = count))
p3 <- p3 + geom_bar(stat="identity", color="black", fill="blue") 
p3 <- p3 + ggtitle("Frequently Used Trigrams") + theme(plot.title = element_text(hjust=0.5))
p3 <- p3 + theme(axis.text.x=element_text(angle=90))
p3

rm(trigram)
# Tokenizer function to get quadgrams
quadgram <- NGramTokenizer(thecorpus, Weka_control(min = 4, max = 4,delimiters = " \\r\\n\\t.,;:\"()?!"))
quadgram <- data.frame(table(quadgram))
quadgram <- quadgram[order(quadgram$Freq,decreasing = TRUE),]

names(quadgram) <- c("words","frequency")
quadgram$words <- as.character(quadgram$words)

str4 <- strsplit(quadgram$words,split=" ")
quadgram <- transform(quadgram,
                      one = sapply(str4,"[[",1),
                      two = sapply(str4,"[[",2),
                      three = sapply(str4,"[[",3), 
                      four = sapply(str4,"[[",4))
quadgram <- data.frame(word1=quadgram$one,
                       word2=quadgram$two, 
                       word3=quadgram$three, 
                       word4=quadgram$four, 
                       frequency=quadgram$frequency, stringsAsFactors=FALSE)
write.csv(quadgram[quadgram$frequency > 1,],"quadgram.csv",row.names=F)
quadgram <- read.csv("quadgram.csv",stringsAsFactors = F)
saveRDS(quadgram,"quadgram.RData")

quadgram <- readRDS("quadgram.RData")

quadgram_plotdata <- paste(quadgram$word1[1:10], quadgram$word2[1:10], quadgram$word3[1:10], quadgram$word4[1:10])
quadgram_print_frame <- data.frame(words = quadgram_plotdata, count = quadgram$frequency[1:10])

p4 <- ggplot(data=quadgram_print_frame, aes(x = words, y = count))
p4 <- p4 + geom_bar(stat="identity", color="black", fill="blue") 
p4 <- p4 + ggtitle("Frequently Used Quadgrams") + theme(plot.title = element_text(hjust=0.5))
p4 <- p4 + theme(axis.text.x=element_text(angle=90))
p4

rm(quadgram)
# Tokenizer function to get fivegrams
fivegram <- NGramTokenizer(thecorpus, Weka_control(min = 5, max = 5,delimiters = " \\r\\n\\t.,;:\"()?!"))
fivegram <- data.frame(table(fivegram))
fivegram <- fivegram[order(fivegram$Freq,decreasing = TRUE),]

names(fivegram) <- c("words","frequency")
fivegram$words <- as.character(fivegram$words)

str5 <- strsplit(fivegram$words,split=" ")
fivegram <- transform(fivegram,
                      one = sapply(str5,"[[",1),
                      two = sapply(str5,"[[",2),
                      three = sapply(str5,"[[",3), 
                      four = sapply(str5,"[[",4),
                      five = sapply(str5,"[[",5)) 
fivegram <- data.frame(word1=fivegram$one,
                       word2=fivegram$two, 
                       word3=fivegram$three, 
                       word4=fivegram$four, 
                       word5=fivegram$five,
                       frequency=fivegram$frequency, stringsAsFactors=FALSE)
write.csv(fivegram[fivegram$frequency > 1,],"fivegram.csv",row.names=F)
fivegram <- read.csv("fivegram.csv",stringsAsFactors = F)
saveRDS(fivegram,"fivegram.RData")

fivegram <- readRDS("fivegram.RData")

fivegram_plotdata <- paste(fivegram$word1[1:10], fivegram$word2[1:10], fivegram$word3[1:10], fivegram$word4[1:10], fivegram$word5[1:10])
fivegram_print_frame <- data.frame(words = fivegram_plotdata, count = fivegram$frequency[1:10])

p5 <- ggplot(data=fivegram_print_frame, aes(x = words, y = count))
p5 <- p5 + geom_bar(stat="identity", color="black", fill="blue") 
p5 <- p5 + ggtitle("Frequently Used Fivegrams") + theme(plot.title = element_text(hjust=0.5))
p5 <- p5 + theme(axis.text.x=element_text(angle=90))
p5

rm(fivegram)
# Tokenizer function to get sixgrams
sixgram <- NGramTokenizer(thecorpus, Weka_control(min = 6, max = 6,delimiters = " \\r\\n\\t.,;:\"()?!"))
sixgram <- data.frame(table(sixgram))
sixgram <- sixgram[order(sixgram$Freq,decreasing = TRUE),]

names(sixgram) <- c("words","frequency")
sixgram$words <- as.character(sixgram$words)

str6 <- strsplit(sixgram$words,split=" ")
sixgram <- transform(sixgram,
                      one = sapply(str6,"[[",1),
                      two = sapply(str6,"[[",2),
                      three = sapply(str6,"[[",3), 
                      four = sapply(str6,"[[",4),
                      five = sapply(str6,"[[",5),
                      six = sapply(str6,"[[",6)) 
sixgram <- data.frame(word1=sixgram$one,
                       word2=sixgram$two, 
                       word3=sixgram$three, 
                       word4=sixgram$four, 
                       word5=sixgram$five,
                       word6=sixgram$six,
                       frequency=sixgram$frequency, stringsAsFactors=FALSE)
write.csv(sixgram[sixgram$frequency > 1,],"sixgram.csv",row.names=F)
sixgram <- read.csv("sixgram.csv",stringsAsFactors = F)
saveRDS(sixgram,"sixgram.RData")

sixgram <- readRDS("sixgram.RData")

sixgram_plotdata <- paste(sixgram$word1[1:10], sixgram$word2[1:10], sixgram$word3[1:10], sixgram$word4[1:10], sixgram$word5[1:10], sixgram$word6[1:10])
sixgram_print_frame <- data.frame(words = sixgram_plotdata, count = sixgram$frequency[1:10])

p6 <- ggplot(data=sixgram_print_frame, aes(x = words, y = count))
p6 <- p6 + geom_bar(stat="identity", color="black", fill="blue") 
p6 <- p6 + ggtitle("Frequently Used Sixgrams") + theme(plot.title = element_text(hjust=0.5))
p6 <- p6 + theme(axis.text.x=element_text(angle=90))
p6

rm(sixgram)