The first step in building a predictive model for text is understanding the distribution of, and relationships between, the words, tokens, and phrases in the text. The goal of this task is to understand the basic relationships observed in the data and to prepare for building the first linguistic models.
Tasks to accomplish:
Exploratory analysis:
Perform a thorough exploratory analysis of the data, understanding the distribution of words and the relationships between words in the corpora.
Understand frequencies of words and word pairs: build figures and tables to show the variation in the frequencies of words and word pairs in the data.
setwd("~/Data Science/Jhon Hopkins/Capstone/Week2")
library(ggplot2)
library(quanteda)
## Warning: package 'quanteda' was built under R version 3.5.2
## Package version: 1.3.14
## Parallel computing: 2 of 8 threads used.
## See https://quanteda.io for tutorials and examples.
##
## Attaching package: 'quanteda'
## The following object is masked from 'package:utils':
##
## View
library(data.table)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
##
## between, first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
The app and report will be built using English text data only, so we select the blogs, news, and Twitter data in the English language.
list.files()
## [1] "dpb-colloc01.pdf"
## [2] "en_US.blogs.txt"
## [3] "en_US.news.txt"
## [4] "en_US.twitter.txt"
## [5] "kbot_complete_hand_written_example.pdf"
## [6] "Milestone.html"
## [7] "Milestone.Rmd"
## [8] "Milestone_cache"
## [9] "Milestone_files"
## [10] "muestra.txt"
## [11] "W2milestone.R"
file.info("en_US.blogs.txt")$size/1024^2
## [1] 200.4242
file.info("en_US.news.txt")$size/1024^2
## [1] 196.2775
file.info("en_US.twitter.txt")$size/1024^2
## [1] 159.3641
Each file is roughly 160-200 MB, which is certainly not a trivial size.
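For this report the files are loaded in full, but as a point of reference, a connection-based read caps memory use when only part of a file is needed. A minimal sketch follows; the 10,000-line limit is an arbitrary illustration, not a tuned value.
con <- file("en_US.blogs.txt", "r")                      # open a read connection
blogs_head <- readLines(con, n = 10000, skipNul = TRUE)  # read only the first 10,000 lines
close(con)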
blogs<-readLines("en_US.blogs.txt",skipNul = TRUE, warn = TRUE)
news<-readLines("en_US.news.txt",skipNul = TRUE, warn = TRUE)
## Warning in readLines("en_US.news.txt", skipNul = TRUE, warn = TRUE):
## incomplete final line found on 'en_US.news.txt'
twitter<-readLines("en_US.twitter.txt",skipNul = TRUE,warn = TRUE)
length(blogs)
## [1] 899288
length(news)
## [1] 77259
length(twitter)
## [1] 2360148
max(nchar(blogs))
## [1] 40835
max(nchar(news))
## [1] 5760
max(nchar(twitter))
## [1] 213
mean(nchar(blogs))
## [1] 231.696
mean(nchar(news))
## [1] 203.0024
mean(nchar(twitter))
## [1] 68.8029
summary(nchar(blogs))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.0 47.0 157.0 231.7 331.0 40835.0
summary(nchar(news))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2 111 186 203 270 5760
summary(nchar(twitter))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.0 37.0 64.0 68.8 100.0 213.0
The data is too massive to process in full, especially considering the memory constraints of mobile phones, so only a sample will be used to build the models.
set.seed(3007)                                            # fix the seed for reproducibility
sam_blogs <- sample(blogs, size = 3000, replace = TRUE)   # 3,000 lines per source
sam_news <- sample(news, size = 3000, replace = TRUE)
sam_twitter <- sample(twitter, size = 3000, replace = TRUE)
muestra <- c(sam_blogs, sam_news, sam_twitter)            # combined 9,000-line sample
writeLines(muestra, "muestra.txt")                        # persist the sample to disk
rm(blogs, twitter, news)                                  # free the full corpora
rm(sam_blogs, sam_news, sam_twitter)
muestra <- as.data.frame(muestra)
names(muestra) <- c("text")
muestra$text <- as.character(muestra$text)
nrow(muestra)
## [1] 9000
It is important to verify whether there are missing values in the data set, especially after combining the three sources.
length(which(!complete.cases(muestra)))
## [1] 0
We can confirm that the sample has no missing values, so we can proceed.
For most of the cleaning and tokenization process we will use functions from the quanteda package. Note that the remove_twitter and remove_hyphens arguments below belong to the quanteda 1.x API loaded above; they were changed in later quanteda releases.
train.tokens <- tokens(muestra$text, what = "word", remove_numbers = TRUE,
                       remove_punct = TRUE, remove_symbols = TRUE,
                       remove_separators = TRUE, remove_twitter = TRUE,
                       remove_hyphens = TRUE, remove_url = TRUE)
train.tokens <- tokens_tolower(train.tokens)
train.tokens.dfm <- dfm(train.tokens, tolower = FALSE)    # document-feature matrix of counts
The text data is now tokenized; next we calculate the frequency of each token.
unigram_freq <- colSums(train.tokens.dfm)                 # total count of each token
termFreq <- data.frame(unigram = names(unigram_freq), frequency = unigram_freq)
termFreq <- arrange(termFreq, desc(frequency))            # sort by descending frequency
top10 <- head(termFreq, 10)
top10
## unigram frequency
## 1 the 13267
## 2 to 7255
## 3 and 6659
## 4 a 6256
## 5 of 5774
## 6 in 4565
## 7 i 3876
## 8 that 2803
## 9 is 2787
## 10 for 2760
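As a cross-check on the manual colSums() approach, quanteda provides topfeatures(), which returns the same top counts directly from the dfm:
topfeatures(train.tokens.dfm, n = 10)   # named vector of the 10 most frequent features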
Plotting the top 10 unigrams:
g1 <- ggplot(data = top10, aes(x = reorder(unigram, -frequency), y = frequency))
g1 <- g1 + geom_bar(stat = "identity", fill = "steelblue") +
  geom_text(aes(label = frequency), vjust = 2, size = 5, color = "white")
g1 <- g1 + ggtitle("Top 10 Most Frequent Unigrams") + xlab("Unigrams") + ylab("Frequency")
g1
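The plot confirms that the distribution is dominated by function words. A quick sketch, using the unigram_freq vector computed above, estimates how many distinct words are needed to cover a given share of all word instances; the 50% and 90% thresholds are illustrative choices.
sorted_freq <- sort(unigram_freq, decreasing = TRUE)      # most frequent first
coverage <- cumsum(sorted_freq) / sum(sorted_freq)        # cumulative share of all tokens
c(words_50 = unname(which(coverage >= 0.5)[1]),           # words needed for 50% coverage
  words_90 = unname(which(coverage >= 0.9)[1]))           # words needed for 90% coverage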
To create n-grams with n > 1 we use the quanteda function tokens_ngrams.
bigram.token <- tokens_ngrams(train.tokens, n = 2)
bigram.token.dfm <- dfm(bigram.token, tolower = FALSE)
bigram_freq <- colSums(bigram.token.dfm)
bigram_termfreq <- data.frame(bigram = names(bigram_freq), frequency = bigram_freq)
bigram_termfreq <- arrange(bigram_termfreq, desc(frequency))
top10bigram <- head(bigram_termfreq, 10)
top10bigram
## bigram frequency
## 1 of_the 1259
## 2 in_the 1172
## 3 to_the 599
## 4 on_the 513
## 5 for_the 496
## 6 to_be 423
## 7 at_the 374
## 8 in_a 362
## 9 and_the 354
## 10 with_the 320
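All ten top bigrams are pairs of function words. For comparison only, here is a sketch of the top bigrams after removing English stopwords with quanteda's tokens_remove(); the prediction model itself keeps stopwords, since they are exactly the words it must predict.
tokens.nostop <- tokens_remove(train.tokens, stopwords("en"))  # drop English stopwords
topfeatures(dfm(tokens_ngrams(tokens.nostop, n = 2)), 10)      # top content-word bigrams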
Plotting the top 10 bigrams:
g2 <- ggplot(data = top10bigram, aes(x = reorder(bigram, -frequency), y = frequency))
g2 <- g2 + geom_bar(stat = "identity", fill = "seagreen4") +
  geom_text(aes(label = frequency), vjust = 2, size = 5, color = "white")
g2 <- g2 + ggtitle("Top 10 Most Frequent Bigrams") + xlab("Bigrams") + ylab("Frequency")
g2
trigram.token <- tokens_ngrams(train.tokens, n = 3)
trigram.token.dfm <- dfm(trigram.token, tolower = FALSE)
trigram_freq <- colSums(trigram.token.dfm)
trigram_termfreq <- data.frame(trigram = names(trigram_freq), frequency = trigram_freq)
trigram_termfreq <- arrange(trigram_termfreq, desc(frequency))
top10trigram <- head(trigram_termfreq, 10)
Plotting the top 10 trigrams:
g3 <- ggplot(data = top10trigram, aes(x = reorder(trigram, -frequency), y = frequency))
g3 <- g3 + geom_bar(stat = "identity", fill = "red4") +
  geom_text(aes(label = frequency), vjust = 2, size = 5, color = "white")
g3 <- g3 + ggtitle("Top 10 Most Frequent Trigrams") + xlab("Trigrams") + ylab("Frequency")
g3
quadgram.token <- tokens_ngrams(train.tokens, n = 4)
quadgram.token.dfm <- dfm(quadgram.token, tolower = FALSE)
quadgram_freq <- colSums(quadgram.token.dfm)
quadgram_termfreq <- data.frame(quadgram = names(quadgram_freq), frequency = quadgram_freq)
quadgram_termfreq <- arrange(quadgram_termfreq, desc(frequency))
top10quadgram <- head(quadgram_termfreq, 10)
Plotting the top 10 quadgrams:
g4 <- ggplot(data = top10quadgram, aes(x = reorder(quadgram, -frequency), y = frequency))
g4 <- g4 + geom_bar(stat = "identity", fill = "purple4") +
  geom_text(aes(label = frequency), vjust = 2, size = 5, color = "white")
g4 <- g4 + ggtitle("Top 10 Most Frequent Quadgrams") + xlab("Quadgrams") + ylab("Frequency")
g4 <- g4 + theme(axis.text.x = element_text(angle = 45, hjust = 1))
g4
pentagram.token <- tokens_ngrams(train.tokens, n = 5)
pentagram.token.dfm <- dfm(pentagram.token, tolower = FALSE)
pentagram_freq <- colSums(pentagram.token.dfm)
pentagram_termfreq <- data.frame(pentagram = names(pentagram_freq), frequency = pentagram_freq)
pentagram_termfreq <- arrange(pentagram_termfreq, desc(frequency))
top10pentagram <- head(pentagram_termfreq, 10)
top10pentagram
## pentagram frequency
## 1 at_the_end_of_the 12
## 2 in_the_middle_of_the 9
## 3 for_the_first_time_in 7
## 4 obama_bin_laden_obama_bin 7
## 5 bin_laden_obama_bin_laden 7
## 6 by_the_end_of_the 6
## 7 laden_obama_bin_laden_obama 6
## 8 the_end_of_the_day 5
## 9 i_want_to_bring_the 4
## 10 want_to_bring_the_best 4
Plotting the top 10 pentagrams:
g5 <- ggplot(data = top10pentagram, aes(x = reorder(pentagram, -frequency), y = frequency))
g5 <- g5 + geom_bar(stat = "identity", fill = "darkorange4") +
  geom_text(aes(label = frequency), vjust = 2, size = 5, color = "white")
g5 <- g5 + ggtitle("Top 10 Most Frequent Pentagrams") + xlab("Pentagrams") + ylab("Frequency")
g5 <- g5 + theme(axis.text.x = element_text(angle = 45, hjust = 1))
g5
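Several of the top pentagrams (the overlapping obama_bin_laden sequences) come from a single line repeated in the Twitter sample, and most higher-order n-grams occur only a handful of times. Given the mobile memory constraints noted earlier, one common mitigation, sketched below with an illustrative cutoff rather than a tuned value, is to drop singleton n-grams before building the prediction tables.
pentagram_pruned <- subset(pentagram_termfreq, frequency > 1)  # keep n-grams seen more than once
nrow(pentagram_termfreq)   # vocabulary size before pruning
nrow(pentagram_pruned)     # vocabulary size after pruning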