The first step in building a predictive model for text is understanding the distribution of, and relationships between, the words, tokens, and phrases in the text. The goal of this task is to understand the basic relationships observed in the data and to prepare for building the first linguistic models.
Tasks to accomplish:
Exploratory analysis:
Perform a thorough exploratory analysis of the data, understanding the distribution of words and the relationships between words in the corpora.
Understand frequencies of words and word pairs: build figures and tables to show the variation in the frequencies of words and word pairs in the data.
setwd("~/Data Science/Jhon Hopkins/Capstone/Week2")
library(ggplot2)
library(quanteda)
## Warning: package 'quanteda' was built under R version 3.5.2
## Package version: 1.3.14
## Parallel computing: 2 of 8 threads used.
## See https://quanteda.io for tutorials and examples.
##
## Attaching package: 'quanteda'
## The following object is masked from 'package:utils':
##
## View
library(data.table)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
##
## between, first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
The app and report will be built using English text data only, so we select the blogs, news, and Twitter data in the English language.
list.files()
## [1] "dpb-colloc01.pdf"
## [2] "en_US.blogs.txt"
## [3] "en_US.news.txt"
## [4] "en_US.twitter.txt"
## [5] "kbot_complete_hand_written_example.pdf"
## [6] "Milestone.html"
## [7] "Milestone.Rmd"
## [8] "Milestone_cache"
## [9] "Milestone_files"
## [10] "muestra.txt"
## [11] "W2milestone.R"
file.info("en_US.blogs.txt")$size/1024^2
## [1] 200.4242
file.info("en_US.news.txt")$size/1024^2
## [1] 196.2775
file.info("en_US.twitter.txt")$size/1024^2
## [1] 159.3641
Each file is roughly 160-200 MB, which is certainly not a trivial size.
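For this report the files are loaded in full, but as a point of reference, a connection-based read caps memory use when only part of a file is needed. A minimal sketch follows; the 10,000-line limit is an arbitrary illustration, not a tuned value.
con <- file("en_US.blogs.txt", "r")                      # open a read connection
blogs_head <- readLines(con, n = 10000, skipNul = TRUE)  # read only the first 10,000 lines
close(con)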
blogs<-readLines("en_US.blogs.txt",skipNul = TRUE, warn = TRUE)
news<-readLines("en_US.news.txt",skipNul = TRUE, warn = TRUE)
## Warning in readLines("en_US.news.txt", skipNul = TRUE, warn = TRUE):
## incomplete final line found on 'en_US.news.txt'
twitter<-readLines("en_US.twitter.txt",skipNul = TRUE,warn = TRUE)
length(blogs)
## [1] 899288
length(news)
## [1] 77259
length(twitter)
## [1] 2360148
max(nchar(blogs))
## [1] 40835
max(nchar(news))
## [1] 5760
max(nchar(twitter))
## [1] 213
mean(nchar(blogs))
## [1] 231.696
mean(nchar(news))
## [1] 203.0024
mean(nchar(twitter))
## [1] 68.8029
summary(nchar(blogs))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.0 47.0 157.0 231.7 331.0 40835.0
summary(nchar(news))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2 111 186 203 270 5760
summary(nchar(twitter))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.0 37.0 64.0 68.8 100.0 213.0
The data is too massive to process in full, especially considering the memory constraints of mobile phones, so only a sample will be used to build the models.
set.seed(3007)                                            # fix the seed for reproducibility
sam_blogs <- sample(blogs, size = 3000, replace = TRUE)   # 3,000 lines per source
sam_news <- sample(news, size = 3000, replace = TRUE)
sam_twitter <- sample(twitter, size = 3000, replace = TRUE)
muestra <- c(sam_blogs, sam_news, sam_twitter)            # combined 9,000-line sample
writeLines(muestra, "muestra.txt")                        # persist the sample to disk
rm(blogs, twitter, news)                                  # free the full corpora
rm(sam_blogs, sam_news, sam_twitter)
muestra <- as.data.frame(muestra)
names(muestra) <- c("text")
muestra$text <- as.character(muestra$text)
nrow(muestra)
## [1] 9000
It is important to verify whether there are missing values in the data set, especially after combining the three sources.
length(which(!complete.cases(muestra)))
## [1] 0
We can confirm that the sample has no missing values, so we can proceed.
For most of the cleaning and tokenization process we will use functions from the quanteda package. Note that the remove_twitter and remove_hyphens arguments below belong to the quanteda 1.x API loaded above; they were changed in later quanteda releases.
train.tokens <- tokens(muestra$text, what = "word", remove_numbers = TRUE,
                       remove_punct = TRUE, remove_symbols = TRUE,
                       remove_separators = TRUE, remove_twitter = TRUE,
                       remove_hyphens = TRUE, remove_url = TRUE)
train.tokens <- tokens_tolower(train.tokens)
train.tokens.dfm <- dfm(train.tokens, tolower = FALSE)    # document-feature matrix of counts
The text data is now tokenized; next we calculate the frequency of each token.
unigram_freq <- colSums(train.tokens.dfm)                 # total count of each token
termFreq <- data.frame(unigram = names(unigram_freq), frequency = unigram_freq)
termFreq <- arrange(termFreq, desc(frequency))            # sort by descending frequency
top10 <- head(termFreq, 10)
top10
## unigram frequency
## 1 the 13267
## 2 to 7255
## 3 and 6659
## 4 a 6256
## 5 of 5774
## 6 in 4565
## 7 i 3876
## 8 that 2803
## 9 is 2787
## 10 for 2760
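As a cross-check on the manual colSums() approach, quanteda provides topfeatures(), which returns the same top counts directly from the dfm:
topfeatures(train.tokens.dfm, n = 10)   # named vector of the 10 most frequent features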
Plotting the top 10 unigrams:
g1 <- ggplot(data = top10, aes(x = reorder(unigram, -frequency), y = frequency))
g1 <- g1 + geom_bar(stat = "identity", fill = "steelblue") +
  geom_text(aes(label = frequency), vjust = 2, size = 5, color = "white")
g1 <- g1 + ggtitle("Top 10 Most Frequent Unigrams") + xlab("Unigrams") + ylab("Frequency")
g1
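The plot confirms that the distribution is dominated by function words. A quick sketch, using the unigram_freq vector computed above, estimates how many distinct words are needed to cover a given share of all word instances; the 50% and 90% thresholds are illustrative choices.
sorted_freq <- sort(unigram_freq, decreasing = TRUE)      # most frequent first
coverage <- cumsum(sorted_freq) / sum(sorted_freq)        # cumulative share of all tokens
c(words_50 = unname(which(coverage >= 0.5)[1]),           # words needed for 50% coverage
  words_90 = unname(which(coverage >= 0.9)[1]))           # words needed for 90% coverage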
To create n-grams with n > 1 we use the quanteda function tokens_ngrams.
bigram.token <- tokens_ngrams(train.tokens, n = 2)
bigram.token.dfm <- dfm(bigram.token, tolower = FALSE)
bigram_freq <- colSums(bigram.token.dfm)
bigram_termfreq <- data.frame(bigram = names(bigram_freq), frequency = bigram_freq)
bigram_termfreq <- arrange(bigram_termfreq, desc(frequency))
top10bigram <- head(bigram_termfreq, 10)
top10bigram
## bigram frequency
## 1 of_the 1259
## 2 in_the 1172
## 3 to_the 599
## 4 on_the 513
## 5 for_the 496
## 6 to_be 423
## 7 at_the 374
## 8 in_a 362
## 9 and_the 354
## 10 with_the 320
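All ten top bigrams are pairs of function words. For comparison only, here is a sketch of the top bigrams after removing English stopwords with quanteda's tokens_remove(); the prediction model itself keeps stopwords, since they are exactly the words it must predict.
tokens.nostop <- tokens_remove(train.tokens, stopwords("en"))  # drop English stopwords
topfeatures(dfm(tokens_ngrams(tokens.nostop, n = 2)), 10)      # top content-word bigrams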
Plotting the top 10 bigrams:
g2 <- ggplot(data = top10bigram, aes(x = reorder(bigram, -frequency), y = frequency))
g2 <- g2 + geom_bar(stat = "identity", fill = "seagreen4") +
  geom_text(aes(label = frequency), vjust = 2, size = 5, color = "white")
g2 <- g2 + ggtitle("Top 10 Most Frequent Bigrams") + xlab("Bigrams") + ylab("Frequency")
g2
trigram.token <- tokens_ngrams(train.tokens, n = 3)
trigram.token.dfm <- dfm(trigram.token, tolower = FALSE)
trigram_freq <- colSums(trigram.token.dfm)
trigram_termfreq <- data.frame(trigram = names(trigram_freq), frequency = trigram_freq)
trigram_termfreq <- arrange(trigram_termfreq, desc(frequency))
top10trigram <- head(trigram_termfreq, 10)
Plotting the top 10 trigrams:
g3 <- ggplot(data = top10trigram, aes(x = reorder(trigram, -frequency), y = frequency))
g3 <- g3 + geom_bar(stat = "identity", fill = "red4") +
  geom_text(aes(label = frequency), vjust = 2, size = 5, color = "white")
g3 <- g3 + ggtitle("Top 10 Most Frequent Trigrams") + xlab("Trigrams") + ylab("Frequency")
g3
quadgram.token <- tokens_ngrams(train.tokens, n = 4)
quadgram.token.dfm <- dfm(quadgram.token, tolower = FALSE)
quadgram_freq <- colSums(quadgram.token.dfm)
quadgram_termfreq <- data.frame(quadgram = names(quadgram_freq), frequency = quadgram_freq)
quadgram_termfreq <- arrange(quadgram_termfreq, desc(frequency))
top10quadgram <- head(quadgram_termfreq, 10)
Plotting the top 10 quadgrams:
g4 <- ggplot(data = top10quadgram, aes(x = reorder(quadgram, -frequency), y = frequency))
g4 <- g4 + geom_bar(stat = "identity", fill = "purple4") +
  geom_text(aes(label = frequency), vjust = 2, size = 5, color = "white")
g4 <- g4 + ggtitle("Top 10 Most Frequent Quadgrams") + xlab("Quadgrams") + ylab("Frequency")
g4 <- g4 + theme(axis.text.x = element_text(angle = 45, hjust = 1))
g4
pentagram.token <- tokens_ngrams(train.tokens, n = 5)
pentagram.token.dfm <- dfm(pentagram.token, tolower = FALSE)
pentagram_freq <- colSums(pentagram.token.dfm)
pentagram_termfreq <- data.frame(pentagram = names(pentagram_freq), frequency = pentagram_freq)
pentagram_termfreq <- arrange(pentagram_termfreq, desc(frequency))
top10pentagram <- head(pentagram_termfreq, 10)
top10pentagram
## pentagram frequency
## 1 at_the_end_of_the 12
## 2 in_the_middle_of_the 9
## 3 for_the_first_time_in 7
## 4 obama_bin_laden_obama_bin 7
## 5 bin_laden_obama_bin_laden 7
## 6 by_the_end_of_the 6
## 7 laden_obama_bin_laden_obama 6
## 8 the_end_of_the_day 5
## 9 i_want_to_bring_the 4
## 10 want_to_bring_the_best 4
Plotting the top 10 pentagrams:
g5 <- ggplot(data = top10pentagram, aes(x = reorder(pentagram, -frequency), y = frequency))
g5 <- g5 + geom_bar(stat = "identity", fill = "darkorange4") +
  geom_text(aes(label = frequency), vjust = 2, size = 5, color = "white")
g5 <- g5 + ggtitle("Top 10 Most Frequent Pentagrams") + xlab("Pentagrams") + ylab("Frequency")
g5 <- g5 + theme(axis.text.x = element_text(angle = 45, hjust = 1))
g5
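Several of the top pentagrams (the overlapping obama_bin_laden sequences) come from a single line repeated in the Twitter sample, and most higher-order n-grams occur only a handful of times. Given the mobile memory constraints noted earlier, one common mitigation, sketched below with an illustrative cutoff rather than a tuned value, is to drop singleton n-grams before building the prediction tables.
pentagram_pruned <- subset(pentagram_termfreq, frequency > 1)  # keep n-grams seen more than once
nrow(pentagram_termfreq)   # vocabulary size before pruning
nrow(pentagram_pruned)     # vocabulary size after pruning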