This milestone report presents the exploratory data analysis for the Capstone Project of the Coursera Data Science specialization.
Coursera and SwiftKey are partnering on this project, which applies data science to natural language processing. The project uses a large text corpus of documents to predict the next word from the preceding input.
The data is extracted and cleaned from files and used with the Shiny application.
Here, we summarize the corpus and outline a plan for building the predictive model.
The 3 files provided with language data from blogs, news and tweets are used to build the English language corpus.
The following steps are executed to prepare the data into a corpus.
library(ggplot2)
## Warning: remplacement de l'importation précédente 'lifecycle::last_warnings' par
## 'rlang::last_warnings' lors du chargement de 'pillar'
library(tm)
## Le chargement a nécessité le package : NLP
##
## Attachement du package : 'NLP'
## L'objet suivant est masqué depuis 'package:ggplot2':
##
## annotate
library(stringi)
library(wordcloud)
## Le chargement a nécessité le package : RColorBrewer
library(RColorBrewer)
library(dplyr)
##
## Attachement du package : 'dplyr'
## Les objets suivants sont masqués depuis 'package:stats':
##
## filter, lag
## Les objets suivants sont masqués depuis 'package:base':
##
## intersect, setdiff, setequal, union
library(slam)
library(data.table)
##
## Attachement du package : 'data.table'
## L'objet suivant est masqué depuis 'package:slam':
##
## rollup
## Les objets suivants sont masqués depuis 'package:dplyr':
##
## between, first, last
library(NLP)
library(RWeka)
# Download the SwiftKey archive and extract it, unless the zip file is already
# present in the working directory.
if (!file.exists("Coursera-SwiftKey.zip")) {
  src_zip_file <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
  # mode = "wb": the archive is a binary file; a text-mode transfer can corrupt
  # it on Windows.
  download.file(src_zip_file, destfile = "Coursera-SwiftKey.zip", mode = "wb")
  unzip("Coursera-SwiftKey.zip")
}
# Look at the source files: list = TRUE returns the archive listing without
# extracting anything.
zip_files <- unzip("Coursera-SwiftKey.zip", list = TRUE)
zip_files$Date <- NULL
# Characters 7-8 of each entry name are taken as the language code.
zip_files$Language <- substr(zip_files$Name, 7, 8)
zip_files$Length_in_Mb <- zip_files$Length / (1024^2)
# Drop zero-length entries.
zip_files <- zip_files[zip_files$Length > 0, ]
# Keep only the en_US entries.
zip_files <- zip_files[grep("en_US", zip_files$Name), ]
The data is from HC Corpora with access to 4 languages, but only English will be used. The dataset has three files: en_US.blogs.txt, en_US.news.txt, en_US.twitter.txt.
The data was downloaded from the Coursera link to the local machine and is read from the local disk.
# Read en_US.blogs.txt, en_US.news.txt, en_US.twitter.txt.
# Each file is opened as a binary connection, read as UTF-8 with embedded NULs
# skipped, and the connection is closed as soon as the lines are in memory so
# that no connection is left open.
blogsURL <- file("en_US.blogs.txt", open = "rb")
blogs <- readLines(blogsURL, encoding = "UTF-8", skipNul = TRUE)
close(blogsURL)
newsURL <- file("en_US.news.txt", open = "rb")
news <- readLines(newsURL, encoding = "UTF-8", skipNul = TRUE)
close(newsURL)
twitterURL <- file("en_US.twitter.txt", open = "rb")
twitter <- readLines(twitterURL, encoding = "UTF-8", skipNul = TRUE)
close(twitterURL)
## Size of files, in megabytes
S1 <- file.info("en_US.blogs.txt")$size / 1024^2
S2 <- file.info("en_US.news.txt")$size / 1024^2
S3 <- file.info("en_US.twitter.txt")$size / 1024^2
## Number of lines
N1 <- length(blogs)
N2 <- length(news)
N3 <- length(twitter)
## Counting the words
C1 <- sum(stri_count_words(blogs))
C2 <- sum(stri_count_words(news))
C3 <- sum(stri_count_words(twitter))
## The length (in characters) of the longest line in each of the three en_US data sets
M1 <- max(nchar(blogs))
M2 <- max(nchar(news))
M3 <- max(nchar(twitter))
# One summary row per source file.
resume <- data.frame(
  Name = c("Blogs", "News", "Twitter"),
  Size = c(S1, S2, S3),
  Nember_of_lines = c(N1, N2, N3),
  Nember_of_words = c(C1, C2, C3),
  max_size_Line = c(M1, M2, M3)
)
resume
## Name Size Nember_of_lines Nember_of_words max_size_Line
## 1 Blogs 200.4242 899288 37546250 40833
## 2 News 196.2775 1010242 34762395 11384
## 3 Twitter 159.3641 2360148 30093413 140
The data set is very large, so we create a subset of it to fit the limited resources available for testing and for the application.
# Draw 5000 lines from each source. Sampling is done with replacement, so a
# line may be drawn more than once. The seed is reset before every draw.
set.seed(1984)
dspl.blogs <- sample(blogs, size = 5000, replace = TRUE)

set.seed(1984)
dspl.news <- sample(news, size = 5000, replace = TRUE)

set.seed(1984)
dspl.tweets <- sample(twitter, size = 5000, replace = TRUE)

# Blend the three samples into one character vector: blogs, tweets, then news.
dspl <- c(dspl.blogs, dspl.tweets, dspl.news)

# Histogram of the number of words per sampled document; the title reports the
# document count with a thousands separator.
hist_title <- paste(
  "Number of words distribution for",
  prettyNum(length(dspl), scientific = FALSE, big.mark = ","),
  "documents"
)
hist(stri_count_words(dspl), breaks = 30, col = rainbow(20), main = hist_title)
length(dspl)
## [1] 15000
The final text data needs to be cleaned before it can be used in the word prediction algorithm. The objective is to create a cleaned corpus from the text sample. The corpus is cleaned by removing extra whitespace, numbers, URLs, punctuation and so on.
The library used here is tm, which loads the corpus into memory and provides the methods used to clean the data.
# Text mining on the sampled data: build an in-memory corpus, one document per
# sampled line.
corpus <- VCorpus(VectorSource(dspl))
# Convert UTF-8 to ASCII, dropping bytes that have no ASCII equivalent.
# sub = "" matters here: with sub = "byte" a character such as a curly
# apostrophe becomes "<e2><80><99>", and the punctuation and number removal
# below then reduce it to a stray "e" glued into the word (e.g. "iem", "donet").
# NOTE: any output rendered before this change still shows those artifacts;
# re-run the report to refresh it.
corpus <- tm_map(corpus, function(x) iconv(x, "UTF-8", "ASCII", sub = ""))
corpus <- tm_map(corpus, tolower, lazy = TRUE)
corpus <- tm_map(corpus, removePunctuation, preserve_intra_word_dashes = TRUE)
corpus <- tm_map(corpus, removeNumbers)
# Strip every "http" followed by any run of letters/digits. This runs after
# punctuation and numbers have already been removed.
removeURL <- function(x) gsub("http[[:alnum:]]*", "", x)
corpus <- tm_map(corpus, removeURL)
corpus <- tm_map(corpus, removeWords, stopwords("english"))
corpus <- tm_map(corpus, stripWhitespace)
# Wrap each cleaned text back into a PlainTextDocument.
corpus <- tm_map(corpus, PlainTextDocument)
# Print the first ten cleaned documents (fewer if the corpus is smaller).
for (i in seq_len(min(10, length(corpus)))) {
  print(corpus[[i]]$content)
}
## [1] " seventh grade math teacher yet another victim evil mind really will see fault"
## [1] " didnt know already norwegians gather party party gin ack wine beer aquavit cognac oh yeah food lets seethere pinekjott sausage sausage mashed turnips yum first time good potatoes stuff brought dessert chocolate cake traditional southern american recipe think confused everyone chocolatyness pound butter calls ate seemed like hey chocolate many people turn warm gooey chocolate"
## [1] " made mosaic will join mary ecthe little red houseed also little blue flowers blue monday smiling sally lovely doors monday doorways "
## [1] " iem adopted sure twin mother marilyn monroe stopped wheeling gave birth youed surprised resemblance iem sure stories true"
## [1] "angel luis juarbe new york ny"
## [1] "iem making money complicated children adventures"
## [1] " years planning years preparing successfully achieving groups goals based logic look amazing paranoid delusional insane man even now looking things logically can continue personal agenda -line bigger agenda started"
## [1] "now days warmer time clean ashes fireplace one last time love idea putting something pretty inside season sometimes use magnolia branches baskets full hydrangeas first image hello color pink pulling heart strings adore texture going rug log stool lend perfect textural balance pouff acrylic coffee table pretty flowers brilliant"
## [1] "tweet include link comments another chance"
## [1] "ecbut going wilt away donet get theyell die tonight latest morning glory doesnet get live longer matter hard tryed clearly reacting now"
## Saving the final corpus
# Write and read through one shared path. The two calls previously used names
# that differed only in letter case ("finalcorpus" vs "finalCorpus"), so the
# read failed on case-sensitive file systems.
corpus_path <- "./finalCorpus.RData"
saveRDS(corpus, file = corpus_path)
final_corpus <- readRDS(corpus_path)
# Flatten the corpus into a one-column data frame holding each document's text.
finalCorpus <- data.frame(
  text = unlist(sapply(final_corpus, `[`, "content")),
  stringsAsFactors = FALSE
)
In Natural Language Processing (NLP), n-gram is a contiguous sequence of n items from a given sequence of text or speech. Unigrams are single words. Bigrams are two words combinations. Trigrams are three-word combinations.
Let’s read the text to break it into words and sentences, and to turn it into n-grams.
Tokenization is available in R through the RWeka package.
The following code extracts 1-grams, 2-grams, 3-grams and 4-grams from the corpus using RWeka.
library(RWeka)
## Unigrams: tokenize, count, sort, and persist
unigram_control <- Weka_control(min = 1, max = 1, delimiters = " \\r\\n\\t.,;:\"()?!")
unigram <- NGramTokenizer(finalCorpus, unigram_control)

# Frequency table of the tokens, most frequent first.
unigram <- data.frame(table(unigram))
unigram <- unigram[order(unigram$Freq, decreasing = TRUE), ]
names(unigram) <- c("word1", "freq")
unigram$word1 <- as.character(unigram$word1)

# Only words seen more than once are written out; the table is then reloaded
# from the CSV and stored again with saveRDS.
seen_more_than_once <- unigram$freq > 1
write.csv(unigram[seen_more_than_once, ], "unigram.csv", row.names = FALSE)
unigram <- read.csv("unigram.csv", stringsAsFactors = FALSE)
saveRDS(unigram, file = "unigram.RData")
head(unigram)
## word1 freq
## 1 said 1471
## 2 will 1351
## 3 one 1294
## 4 just 1097
## 5 like 1075
## 6 can 1039
## Unigram plot: horizontal bar chart of the first ten rows of the saved table
unigram <- readRDS("unigram.RData")
top_unigrams <- unigram[1:10, ]
# Base layer: word on x, frequency on y.
p1 <- ggplot(data = top_unigrams, aes(x = word1, y = freq))
# Bars drawn from the stored counts, flipped so the words read horizontally.
p2 <- p1 +
  geom_bar(stat = "identity") +
  coord_flip() +
  ggtitle("Frequently Words")
# Label every bar with its count.
p3 <- p2 +
  geom_text(
    data = top_unigrams,
    aes(x = word1, y = freq, label = freq),
    hjust = -1,
    position = "identity"
  )
p3
# Bigrams: tokenize the corpus text into two-word sequences
bigram_control <- Weka_control(min = 2, max = 2, delimiters = " \\r\\n\\t.,;:\"()?!")
bigram <- NGramTokenizer(finalCorpus, bigram_control)

# Frequency table of the bigrams, most frequent first.
bigram <- data.frame(table(bigram))
bigram <- bigram[order(bigram$Freq, decreasing = TRUE), ]
names(bigram) <- c("words", "freq")
head(bigram)
## words freq
## 103415 last year 85
## 127319 new york 82
## 103406 last week 67
## 86580 high school 65
## 159153 right now 65
## 216417 years ago 56
# Split each bigram into its two words, one column per word.
bigram$words <- as.character(bigram$words)
str2 <- strsplit(bigram$words, split = " ")
# vapply guarantees a character vector even when the table is empty, where
# sapply would return a list.
bigram <- data.frame(
  word1 = vapply(str2, `[[`, character(1), 1),
  word2 = vapply(str2, `[[`, character(1), 2),
  freq = bigram$freq,
  stringsAsFactors = FALSE
)
## saving files
# Only bigrams seen more than once are written out; the table is then reloaded
# from the CSV and stored again with saveRDS.
write.csv(bigram[bigram$freq > 1, ], "bigram.csv", row.names = FALSE)
bigram <- read.csv("bigram.csv", stringsAsFactors = FALSE)
saveRDS(bigram, "bigram.RData")
# Trigrams: tokenize the corpus text into three-word sequences
trigram_control <- Weka_control(min = 3, max = 3, delimiters = " \\r\\n\\t.,;:\"()?!")
trigram <- NGramTokenizer(finalCorpus, trigram_control)

# Frequency table of the trigrams, most frequent first.
trigram <- data.frame(table(trigram))
trigram <- trigram[order(trigram$Freq, decreasing = TRUE), ]
names(trigram) <- c("words", "freq")
head(trigram)
## words freq
## 162631 president barack obama 15
## 12298 ass ass ass 9
## 29720 cant wait see 9
## 100661 im pretty sure 8
## 141389 new york times 8
## 197719 st louis county 8
#####################
# Split each trigram into its three words, one column per word.
trigram$words <- as.character(trigram$words)
str3 <- strsplit(trigram$words, split = " ")
# vapply guarantees a character vector even when the table is empty, where
# sapply would return a list.
trigram <- data.frame(
  word1 = vapply(str3, `[[`, character(1), 1),
  word2 = vapply(str3, `[[`, character(1), 2),
  word3 = vapply(str3, `[[`, character(1), 3),
  freq = trigram$freq,
  stringsAsFactors = FALSE
)
# saving files
# Only trigrams seen more than once are written out; the table is then reloaded
# from the CSV and stored again with saveRDS.
write.csv(trigram[trigram$freq > 1, ], "trigram.csv", row.names = FALSE)
trigram <- read.csv("trigram.csv", stringsAsFactors = FALSE)
saveRDS(trigram, "trigram.RData")
# Quadgrams: tokenize the corpus text into four-word sequences
quadgram_control <- Weka_control(min = 4, max = 4, delimiters = " \\r\\n\\t.,;:\"()?!")
quadgram <- NGramTokenizer(finalCorpus, quadgram_control)

# Frequency table of the quadgrams, most frequent first.
quadgram <- data.frame(table(quadgram))
quadgram <- quadgram[order(quadgram$Freq, decreasing = TRUE), ]
names(quadgram) <- c("words", "freq")
head(quadgram)
## words freq
## 12344 ass ass ass ass 8
## 113435 la la la la 5
## 45771 cricket new zealand australia 4
## 155843 per serving calories g 4
## 179317 ruiz kick dn e 4
## 188440 serving calories g fat 4
##################
# Split each quadgram into its four words, one column per word.
quadgram$words <- as.character(quadgram$words)
str4 <- strsplit(quadgram$words, split = " ")
# vapply guarantees a character vector even when the table is empty, where
# sapply would return a list.
quadgram <- data.frame(
  word1 = vapply(str4, `[[`, character(1), 1),
  word2 = vapply(str4, `[[`, character(1), 2),
  word3 = vapply(str4, `[[`, character(1), 3),
  word4 = vapply(str4, `[[`, character(1), 4),
  freq = quadgram$freq,
  stringsAsFactors = FALSE
)
# saving files
# Only quadgrams seen more than once are written out; the table is then
# reloaded from the CSV and stored again with saveRDS.
write.csv(quadgram[quadgram$freq > 1, ], "quadgram.csv", row.names = FALSE)
quadgram <- read.csv("quadgram.csv", stringsAsFactors = FALSE)
saveRDS(quadgram, "quadgram.RData")