The goal of this exercise is to explore the three datasets provided: en_US.blogs.txt, en_US.news.txt, and en_US.twitter.txt. All three files contain natural language text and will be used to develop a model that predicts the next word after a user enters a two- or three-word phrase.
require(stringi)
## Loading required package: stringi
require(NLP)
## Loading required package: NLP
require(tm)
## Loading required package: tm
## Warning: package 'tm' was built under R version 3.4.1
require(rJava)
## Loading required package: rJava
require(RWeka)
## Loading required package: RWeka
## Warning: package 'RWeka' was built under R version 3.4.1
require(RWekajars)
## Loading required package: RWekajars
require(RColorBrewer)
## Loading required package: RColorBrewer
require(ggplot2)
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
location <- "WORK" # Location is either HOME or WORK
set.seed(52)
filename_blogs <- "en_US.blogs.txt"
filename_news <- "en_US.news.txt"
filename_twitter <- "en_US.twitter.txt"
getwd()
## [1] "C:/Users/jmcconn3/Dropbox/DataScienceSpecializationJHU/Capstone - Class 10/FinalProject"
if (location == "HOME") {
directory_location <- "C:/Users/johns dell/Dropbox/DataScienceSpecializationJHU/Capstone - Class 10/FinalProject/en_US/Data/"
filename_profanity <- "C:/Users/johns dell/Dropbox/DataScienceSpecializationJHU/Capstone - Class 10/FinalProject/en_US/profanitylist.txt"
} else {
directory_location <- "C:/Users/jmcconn3/Dropbox/DataScienceSpecializationJHU/Capstone - Class 10/FinalProject/en_US/Data/"
filename_profanity <- "C:/Users/jmcconn3/Dropbox/DataScienceSpecializationJHU/Capstone - Class 10/FinalProject/en_US/profanitylist.txt"
}
setwd(directory_location)
getwd()
## [1] "C:/Users/jmcconn3/Dropbox/DataScienceSpecializationJHU/Capstone - Class 10/FinalProject/en_US/Data"
# Read each file through a binary connection so embedded nulls and odd
# characters do not truncate the read; close each connection afterwards
blogsfile <- file(filename_blogs, open = "rb")
blogs <- readLines(blogsfile, encoding = "UTF-8", skipNul = TRUE)
close(blogsfile)
newsfile <- file(filename_news, open = "rb")
news <- readLines(newsfile, encoding = "UTF-8", skipNul = TRUE)
close(newsfile)
twitterfile <- file(filename_twitter, open = "rb")
twitter <- readLines(twitterfile, encoding = "UTF-8", skipNul = TRUE)
close(twitterfile)
profanitywordslist <- readLines(filename_profanity)
## Warning in readLines(filename_profanity): incomplete final line found on
## 'C:/Users/jmcconn3/Dropbox/DataScienceSpecializationJHU/Capstone - Class
## 10/FinalProject/en_US/profanitylist.txt'
# Size of each file in megabytes
file.info(filename_blogs)$size / 1024^2
## [1] 200.4242
file.info(filename_news)$size / 1024^2
## [1] 196.2775
file.info(filename_twitter)$size / 1024^2
## [1] 159.3641
# Number of lines in each file
length(blogs)
## [1] 899288
length(news)
## [1] 1010242
length(twitter)
## [1] 2360148
# Number of words in each file
sum(stri_count_words(blogs))
## [1] 37546246
sum(stri_count_words(news))
## [1] 34762395
sum(stri_count_words(twitter))
## [1] 30093410
# Length of the longest line in each of the three en_US data sets
max(nchar(blogs))
## [1] 40833
max(nchar(news))
## [1] 11384
max(nchar(twitter))
## [1] 140
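The 140-character maximum in the Twitter file reflects Twitter's tweet length limit at the time the data were collected.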
set.seed(52)
# Sample 2000 lines from each source (with replacement, so duplicate lines are possible)
sample_twitter <- sample(twitter, size = 2000, replace = TRUE)
sample_blogs <- sample(blogs, size = 2000, replace = TRUE)
sample_news <- sample(news, size = 2000, replace = TRUE)
sample_total <- c(sample_twitter, sample_blogs, sample_news)
length(sample_total)
## [1] 6000
writeLines(sample_total, "thesample.txt")
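Because the sampling above is done with replacement, the 6000-line sample can contain duplicate lines. A quick sanity check (an aside, not part of the main pipeline):
# Count how many of the sampled lines are duplicates
sum(duplicated(sample_total))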
cleanup_text <- content_transformer(function(doc)
{
    # Remove http/https/ftp URLs and bare www addresses
    doc <- gsub("(f|ht)tp(s?)://(.*)[.][a-z]+", "", doc)
    doc <- gsub("(f|ht)tp\\S+\\s*", "", doc)
    # Strip ordinal suffixes from numbers (1st, 2nd, 3rd, 4th, ...)
    doc <- gsub("([0-9])(st|nd|rd|th)", "\\1", doc)
    # Replace everything other than lowercase letters, periods, apostrophes, and spaces
    doc <- gsub("[^a-z.' ]", " ", doc)
    doc <- gsub("www\\.[a-z]+\\.[a-z]+", "", doc)
    # Turn remaining periods into spaces
    doc <- gsub("\\.", " ", doc)
    # Clean up stray apostrophes between or around words
    doc <- gsub("\\'+ \\'+", " ", doc)
    doc <- gsub("(\\'+ )+|( \\'+)+|^\\'+|\\'+$", " ", doc)
    # Blank out lines that consist of a single word
    doc <- gsub("^[a-z]+$", "", doc)
    # Drop single-letter tokens other than 'a' and 'i'
    doc <- gsub("( [^ai])+ |^([^ai] )+|( [^ai])+$", " ", doc)
    # Trim leading and trailing spaces
    doc <- gsub("^ +| +$", "", doc)
    # Remove any remaining apostrophes
    doc <- gsub("'", "", doc)
    return(doc)
})
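A quick spot-check of cleanup_text on a throwaway one-line corpus (the input string below is invented for illustration): the URL, the ordinal number, and the apostrophe should all be stripped, with any extra internal spaces left for stripWhitespace to handle.
# Hypothetical spot-check of cleanup_text; the input string is invented here
demo_corpus <- Corpus(VectorSource("check out http://example.com on the 21st it's great"))
demo_corpus <- tm_map(demo_corpus, content_transformer(tolower))
demo_corpus <- tm_map(demo_corpus, cleanup_text)
as.character(demo_corpus[[1]])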
# Read the sample back in and build a tm corpus from it
thecorpus <- readLines("thesample.txt")
thecorpus <- Corpus(VectorSource(thecorpus))
# Clean the corpus with the tm transformations
thecorpus <- tm_map(thecorpus, content_transformer(function(x) iconv(x, to = "UTF-8", sub = "byte")))
thecorpus <- tm_map(thecorpus, content_transformer(tolower))
thecorpus <- tm_map(thecorpus, removePunctuation, preserve_intra_word_dashes = TRUE)
# Remove profanity words
thecorpus <- tm_map(thecorpus, removeWords, profanitywordslist)
thecorpus <- tm_map(thecorpus, removeNumbers)
thecorpus <- tm_map(thecorpus, stripWhitespace)
thecorpus <- tm_map(thecorpus, cleanup_text)
thecorpus <- tm_map(thecorpus, PlainTextDocument)
summary(thecorpus)
## Length Class Mode
## content 2 PlainTextDocument list
## meta 2 PlainTextDocument list
saveRDS(thecorpus, file = "thecorpus.RData")
#thecorpus <- readRDS("thecorpus.RData")
# Tokenize the corpus into unigrams and tabulate their frequencies
unigram <- NGramTokenizer(thecorpus, Weka_control(min = 1, max = 1, delimiters = " \\r\\n\\t.,;:\"()?!"))
unigram <- data.frame(table(unigram))
unigram <- unigram[order(unigram$Freq,decreasing = TRUE),]
names(unigram) <- c("word1", "frequency")
head(unigram)
## word1 frequency
## 18010 the 9052
## 18256 to 4901
## 653 and 4586
## 12 a 4236
## 12430 of 3873
## 8915 in 2979
unigram$word1 <- as.character(unigram$word1)
# Keep only words seen more than once to shrink the saved table
write.csv(unigram[unigram$frequency > 1, ], "unigram.csv", row.names = FALSE)
unigram <- read.csv("unigram.csv", stringsAsFactors = FALSE)
saveRDS(unigram, file = "unigram.RData")
# Unigram plot
unigram <- readRDS("unigram.RData")
unigram_plotdata <- unigram$word1[1:10]
unigram_print_frame <- data.frame(words = unigram_plotdata, count = unigram$frequency[1:10])
# Order the bars by frequency rather than alphabetically
p1 <- ggplot(data = unigram_print_frame, aes(x = reorder(words, -count), y = count))
p1 <- p1 + geom_bar(stat = "identity", color = "black", fill = "blue") + xlab("words")
p1 <- p1 + ggtitle("Frequently Used Unigrams") + theme(plot.title = element_text(hjust = 0.5))
p1 <- p1 + theme(axis.text.x = element_text(angle = 90))
p1
rm(unigram)
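As an exploratory aside, the sorted unigram table also makes coverage questions easy to answer, e.g. how many distinct words account for half of all retained word instances (keeping in mind that words seen only once were dropped when the table was written out):
# How many of the most frequent words cover 50% of the retained word instances?
unigram <- readRDS("unigram.RData")
coverage <- cumsum(unigram$frequency) / sum(unigram$frequency)
which(coverage >= 0.5)[1]
rm(unigram)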
# Tokenize the corpus into bigrams and tabulate their frequencies
bigram <- NGramTokenizer(thecorpus, Weka_control(min = 2, max = 2, delimiters = " \\r\\n\\t.,;:\"()?!"))
bigram <- data.frame(table(bigram))
bigram <- bigram[order(bigram$Freq,decreasing = TRUE),]
names(bigram) <- c("words","frequency")
head(bigram)
## words frequency
## 64747 of the 874
## 46232 in the 770
## 97665 to the 427
## 65923 on the 392
## 34675 for the 339
## 96486 to be 300
bigram$words <- as.character(bigram$words)
# Split each bigram into its component words
str2 <- strsplit(bigram$words, split = " ")
bigram <- transform(bigram,
one = sapply(str2,"[[",1),
two = sapply(str2,"[[",2))
bigram <- data.frame(word1 = bigram$one, word2=bigram$two, frequency=bigram$frequency, stringsAsFactors=FALSE)
write.csv(bigram[bigram$frequency > 1,],"bigram.csv",row.names=F)
bigram <- read.csv("bigram.csv",stringsAsFactors = F)
saveRDS(bigram,"bigram.RData")
bigram <- readRDS("bigram.RData")
bigram_plotdata <- paste(bigram$word1[1:10], bigram$word2[1:10])
bigram_print_frame <- data.frame(words = bigram_plotdata, count = bigram$frequency[1:10])
p2 <- ggplot(data = bigram_print_frame, aes(x = reorder(words, -count), y = count))
p2 <- p2 + geom_bar(stat = "identity", color = "black", fill = "blue") + xlab("words")
p2 <- p2 + ggtitle("Frequently Used Bigrams") + theme(plot.title = element_text(hjust = 0.5))
p2 <- p2 + theme(axis.text.x = element_text(angle = 90))
p2
rm(bigram)
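The bigram table already supports a rudimentary next-word lookup. A minimal sketch (an illustration, not the final model): given one word, return its most frequent continuation.
# Most frequent continuation of a single word, from the saved bigram table
bigram <- readRDS("bigram.RData")
next_word <- function(w1) {
    hits <- bigram[bigram$word1 == w1, ]
    if (nrow(hits) == 0) return(NA_character_)
    hits$word2[which.max(hits$frequency)]
}
next_word("of")   # returns "the", given the counts shown above
rm(bigram)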
# Tokenize the corpus into trigrams and tabulate their frequencies
trigram <- NGramTokenizer(thecorpus, Weka_control(min = 3, max = 3, delimiters = " \\r\\n\\t.,;:\"()?!"))
trigram <- data.frame(table(trigram))
trigram <- trigram[order(trigram$Freq,decreasing = TRUE),]
names(trigram) <- c("words","frequency")
head(trigram)
## words frequency
## 97539 one of the 67
## 1860 a lot of 52
## 99994 out of the 40
## 140804 to be a 37
## 19126 be able to 34
## 101341 part of the 32
trigram$words <- as.character(trigram$words)
str3 <- strsplit(trigram$words,split=" ")
trigram <- transform(trigram,
one = sapply(str3,"[[",1),
two = sapply(str3,"[[",2),
three = sapply(str3,"[[",3))
trigram <- data.frame(word1=trigram$one,
word2=trigram$two,
word3=trigram$three, frequency=trigram$frequency,stringsAsFactors=FALSE)
write.csv(trigram[trigram$frequency > 1,],"trigram.csv",row.names=F)
trigram <- read.csv("trigram.csv",stringsAsFactors = F)
saveRDS(trigram,"trigram.RData")
trigram <- readRDS("trigram.RData")
trigram_plotdata <- paste(trigram$word1[1:10], trigram$word2[1:10], trigram$word3[1:10])
trigram_print_frame <- data.frame(words = trigram_plotdata, count = trigram$frequency[1:10])
p3 <- ggplot(data = trigram_print_frame, aes(x = reorder(words, -count), y = count))
p3 <- p3 + geom_bar(stat = "identity", color = "black", fill = "blue") + xlab("words")
p3 <- p3 + ggtitle("Frequently Used Trigrams") + theme(plot.title = element_text(hjust = 0.5))
p3 <- p3 + theme(axis.text.x = element_text(angle = 90))
p3
rm(trigram)
# Tokenize the corpus into quadgrams and tabulate their frequencies
quadgram <- NGramTokenizer(thecorpus, Weka_control(min = 4, max = 4, delimiters = " \\r\\n\\t.,;:\"()?!"))
quadgram <- data.frame(table(quadgram))
quadgram <- quadgram[order(quadgram$Freq,decreasing = TRUE),]
names(quadgram) <- c("words","frequency")
quadgram$words <- as.character(quadgram$words)
str4 <- strsplit(quadgram$words,split=" ")
quadgram <- transform(quadgram,
one = sapply(str4,"[[",1),
two = sapply(str4,"[[",2),
three = sapply(str4,"[[",3),
four = sapply(str4,"[[",4))
quadgram <- data.frame(word1=quadgram$one,
word2=quadgram$two,
word3=quadgram$three,
word4=quadgram$four,
frequency=quadgram$frequency, stringsAsFactors=FALSE)
write.csv(quadgram[quadgram$frequency > 1,],"quadgram.csv",row.names=F)
quadgram <- read.csv("quadgram.csv",stringsAsFactors = F)
saveRDS(quadgram,"quadgram.RData")
quadgram <- readRDS("quadgram.RData")
quadgram_plotdata <- paste(quadgram$word1[1:10], quadgram$word2[1:10], quadgram$word3[1:10], quadgram$word4[1:10])
quadgram_print_frame <- data.frame(words = quadgram_plotdata, count = quadgram$frequency[1:10])
p4 <- ggplot(data = quadgram_print_frame, aes(x = reorder(words, -count), y = count))
p4 <- p4 + geom_bar(stat = "identity", color = "black", fill = "blue") + xlab("words")
p4 <- p4 + ggtitle("Frequently Used Quadgrams") + theme(plot.title = element_text(hjust = 0.5))
p4 <- p4 + theme(axis.text.x = element_text(angle = 90))
p4
rm(quadgram)
# Tokenize the corpus into fivegrams and tabulate their frequencies
fivegram <- NGramTokenizer(thecorpus, Weka_control(min = 5, max = 5, delimiters = " \\r\\n\\t.,;:\"()?!"))
fivegram <- data.frame(table(fivegram))
fivegram <- fivegram[order(fivegram$Freq,decreasing = TRUE),]
names(fivegram) <- c("words","frequency")
fivegram$words <- as.character(fivegram$words)
str5 <- strsplit(fivegram$words,split=" ")
fivegram <- transform(fivegram,
one = sapply(str5,"[[",1),
two = sapply(str5,"[[",2),
three = sapply(str5,"[[",3),
four = sapply(str5,"[[",4),
five = sapply(str5,"[[",5))
fivegram <- data.frame(word1=fivegram$one,
word2=fivegram$two,
word3=fivegram$three,
word4=fivegram$four,
word5=fivegram$five,
frequency=fivegram$frequency, stringsAsFactors=FALSE)
write.csv(fivegram[fivegram$frequency > 1,],"fivegram.csv",row.names=F)
fivegram <- read.csv("fivegram.csv",stringsAsFactors = F)
saveRDS(fivegram,"fivegram.RData")
fivegram <- readRDS("fivegram.RData")
fivegram_plotdata <- paste(fivegram$word1[1:10], fivegram$word2[1:10], fivegram$word3[1:10], fivegram$word4[1:10], fivegram$word5[1:10])
fivegram_print_frame <- data.frame(words = fivegram_plotdata, count = fivegram$frequency[1:10])
p5 <- ggplot(data = fivegram_print_frame, aes(x = reorder(words, -count), y = count))
p5 <- p5 + geom_bar(stat = "identity", color = "black", fill = "blue") + xlab("words")
p5 <- p5 + ggtitle("Frequently Used Fivegrams") + theme(plot.title = element_text(hjust = 0.5))
p5 <- p5 + theme(axis.text.x = element_text(angle = 90))
p5
rm(fivegram)
# Tokenize the corpus into sixgrams and tabulate their frequencies
sixgram <- NGramTokenizer(thecorpus, Weka_control(min = 6, max = 6, delimiters = " \\r\\n\\t.,;:\"()?!"))
sixgram <- data.frame(table(sixgram))
sixgram <- sixgram[order(sixgram$Freq,decreasing = TRUE),]
names(sixgram) <- c("words","frequency")
sixgram$words <- as.character(sixgram$words)
str6 <- strsplit(sixgram$words,split=" ")
sixgram <- transform(sixgram,
one = sapply(str6,"[[",1),
two = sapply(str6,"[[",2),
three = sapply(str6,"[[",3),
four = sapply(str6,"[[",4),
five = sapply(str6,"[[",5),
six = sapply(str6,"[[",6))
sixgram <- data.frame(word1=sixgram$one,
word2=sixgram$two,
word3=sixgram$three,
word4=sixgram$four,
word5=sixgram$five,
word6=sixgram$six,
frequency=sixgram$frequency, stringsAsFactors=FALSE)
write.csv(sixgram[sixgram$frequency > 1,],"sixgram.csv",row.names=F)
sixgram <- read.csv("sixgram.csv",stringsAsFactors = F)
saveRDS(sixgram,"sixgram.RData")
sixgram <- readRDS("sixgram.RData")
sixgram_plotdata <- paste(sixgram$word1[1:10], sixgram$word2[1:10], sixgram$word3[1:10], sixgram$word4[1:10], sixgram$word5[1:10], sixgram$word6[1:10])
sixgram_print_frame <- data.frame(words = sixgram_plotdata, count = sixgram$frequency[1:10])
p6 <- ggplot(data = sixgram_print_frame, aes(x = reorder(words, -count), y = count))
p6 <- p6 + geom_bar(stat = "identity", color = "black", fill = "blue") + xlab("words")
p6 <- p6 + ggtitle("Frequently Used Sixgrams") + theme(plot.title = element_text(hjust = 0.5))
p6 <- p6 + theme(axis.text.x = element_text(angle = 90))
p6
rm(sixgram)
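To close, a hedged sketch of how the saved tables could be combined at prediction time, backing off from the trigram table to the bigram table when the longer context is unseen. This is only an illustration of the idea; the final model will refine it.
# Back off from trigram context to bigram context when there is no match
trigram <- readRDS("trigram.RData")
bigram <- readRDS("bigram.RData")
predict_word <- function(w1, w2) {
    hit <- trigram[trigram$word1 == w1 & trigram$word2 == w2, ]
    if (nrow(hit) > 0) return(hit$word3[which.max(hit$frequency)])
    hit <- bigram[bigram$word1 == w2, ]
    if (nrow(hit) > 0) return(hit$word2[which.max(hit$frequency)])
    NA_character_
}
predict_word("one", "of")   # returns "the", given the trigram counts above
rm(trigram, bigram)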