Introduction

This project is created as part of the capstone project of the Data Science specialization. This report focuses on the exploratory data analysis of the input data files.

The below analysis shows file sizes, number of words/lines per file. Also shows unigram, bigram and trigram models, plots top20 words from each models and finally shows how many unique words cover 50% & 90% of the total words of sample data.

Data source

The data is from a corpus called HC Corpora (www.corpora.heliohost.org). See the readme file at http://www.corpora.heliohost.org/aboutcorpus.html for details on the corpora available.

Reading the data.

Below loading the data and finding out various attributes of the data such as number of lines per file, size of the file, number of words in each file etc.

Dataset FileSize Lines TotalWords
en_US.blogs.txt 210 MB 899288 208361.4K
en_US.news.txt 206 MB 77259 15683.8K
en_US.twitter.txt 167 MB 2360148 162385K

Data preparation

I have read three different data files from three sources. Due to limitations in processing power, I have used sample of only 2,000 lines from each data set. Cleaned the data by removing the numbers, stopwords, whitespaces and punctuations. And finally all words are converted to lower case.

For profanity filtering, I have used a “badwords” list from (source: https://github.com/shutterstock/List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words/blob/master/en).

    badwordslines<-readLines(con <- file(badwords,"r"),skipNul=TRUE)
    close(con)
    
    .contract <- function(contract) {
        contract <- gsub("won't", "will not", contract)
        contract <- gsub("n't", " not", contract)
        contract <- gsub("'ll", " will", contract)
        contract <- gsub("'re", " are", contract)
        contract <- gsub("'ve", " have", contract)
        contract <- gsub("'m", " am", contract)
        contract <- gsub("'s", "", contract)
        return(contract)
    }
    # sample and process corpora
    SPC<-function(x) { 
      x<-VCorpus(VectorSource(x))
      x<-tm_map(x,content_transformer(gsub),pattern="won't",replace="will not")
      x<-tm_map(x,content_transformer(gsub),pattern="n't",replace=" not")
      x<-tm_map(x,content_transformer(gsub),pattern="'ll",replace=" will")
      x<-tm_map(x,content_transformer(gsub),pattern="'re",replace=" are")
      x<-tm_map(x,content_transformer(gsub),pattern="'ve",replace=" have")
      x<-tm_map(x,content_transformer(gsub),pattern="'m",replace=" am")
      x<-tm_map(x,content_transformer(gsub),pattern="'s",replace="")
      x<-tm_map(x,removeNumbers)
      x<-tm_map(x,removeWords, stopwords("english"))
      x<-tm_map(x,removeWords, badwordslines)
      x<-tm_map(x,stripWhitespace)
      x<-tm_map(x, content_transformer(tolower))
      x<-tm_map(x, removePunctuation,preserve_intra_word_dashes = TRUE)
      
    }

    sampleSize <- 2000
    sample_blogs <- readLines(blogfile, n = sampleSize)
    sample_news <- readLines(newsfile, n = sampleSize)
    sample_twitter <- readLines(twitfile, n = sampleSize)
    
    samples <- c(sample_blogs, sample_news, sample_twitter)
    
    # find indices of words with non-ASCII characters, remove words with non-ASCII characters
    dat <- grep("corpus", iconv(samples, "latin1", "ASCII", sub="corpus"))
    # subset original vector of words to exclude words with non-ASCII char
    samples <- samples[-dat]
    # convert vector back to a string
    samples <- paste(samples, collapse = ", ")
    
    samples_a<-SPC(samples); 

Tokenization

The following code generates n-gram tokens from the corpora using the RWeka package and creates unigrams, bigrams and trigrams for the dataset.

    UniToken<-function(x)NGramTokenizer(x, Weka_control(min = 1, max = 1))
    BiToken<-function(x)NGramTokenizer(x, Weka_control(min = 2, max = 2))
    TriToken<-function(x)NGramTokenizer(x, Weka_control(min = 3, max = 3))
    

    myDtm <- TermDocumentMatrix(samples_a, control = list(minWordLength = 1))
    #findFreqTerms(myDtm, lowfreq=100)

    uni.spa <- TermDocumentMatrix(samples_a, control = list(tokenize = UniToken))
    uni <- findFreqTerms(uni.spa)
    uni.sort <- sort(rowSums(as.matrix(uni.spa[uni,])),decreasing=TRUE)
    uni.freq <- data.frame(word=names(uni.sort),frequency=uni.sort) 
    ggplot(uni.freq[1:20,], aes(factor(word, levels=unique(word)),frequency, fill=frequency)) +
      geom_bar(stat = 'identity') +
      theme(axis.text.x=element_text(angle=90)) +
      labs(title="Unigram") +
        xlab('Top20 Words') +
      ylab('Frequency')    

    m <- as.matrix(myDtm)
    # calculate the frequency of words
    v <- sort(rowSums(m), decreasing=TRUE)
    myNames <- names(v)
    head(myNames)
## [1] "the"  "will" "said" "one"  "just" "like"
    d <- data.frame(word=myNames, freq=v)
    wordcloud(d$word, d$freq, max.words =100,colors=brewer.pal(8, "Dark2"))

    # bigram wordcloud code
    tdm.bigram <- TermDocumentMatrix(samples_a, control = list(tokenize = BiToken))

        # Try removing sparse terms at a few different levels
    tdm99.bigram  <- removeSparseTerms(tdm.bigram, 0.99)
    
    bi<- findFreqTerms(tdm99.bigram)
    bi.sort <- sort(rowSums(as.matrix(tdm99.bigram[bi,])),decreasing=TRUE)
    bi.freq <- data.frame(word=names(bi.sort),frequency=bi.sort) 
    ggplot(bi.freq[1:20,], aes(factor(word, levels=unique(word)),frequency, fill=frequency)) +
      geom_bar(stat = 'identity') +
      theme(axis.text.x=element_text(angle=90)) +
      labs(title="Bigram") +
        xlab('Top20 Words') +
      ylab('Frequency') 

    m2 = as.matrix(tdm99.bigram)
    v2 = sort(rowSums(m2),decreasing=TRUE)
    d2 = data.frame(word = names(v2),freq=v2)
    #str(d2)
    
    # Create the word cloud
    # pal = brewer.pal(9,"BuPu")
    # wordcloud(words = d2$word,
    #           freq = d2$freq, scale = c(3,.8),
    #           random.order = F,
    #           relative_scaling= 0, prefer_horizontal = 0.6,
    #           rot.per=0.35,
    #           colors = pal,
    #           max.words =40 )
    wordcloud(d2$word, d2$freq, min.freq = 1, max.words =30,scale = c(3.5,0.2), colors=brewer.pal(8, "Dark2"))

    # trigram wordcloud code
    tdm.trigram <- TermDocumentMatrix(samples_a, control = list(tokenize = TriToken))

    # Try removing sparse terms at a few different levels
    tdm999.trigram  <- removeSparseTerms(tdm.trigram, 0.999)
    
    tri<- findFreqTerms(tdm999.trigram)
    tri.sort <- sort(rowSums(as.matrix(tdm999.trigram[tri,])),decreasing=TRUE)
    tri.freq <- data.frame(word=names(tri.sort),frequency=tri.sort) 
    ggplot(tri.freq[1:20,], aes(factor(word, levels=unique(word)),frequency, fill=frequency)) +
      geom_bar(stat = 'identity') +
      theme(axis.text.x=element_text(angle=90)) +
      labs(title="Trigram") +
        xlab('Top20 Words') +
      ylab('Frequency') 

    m3 = as.matrix(tdm999.trigram)
    v3 = sort(rowSums(m3),decreasing=TRUE)
    d3 = data.frame(word = names(v3),freq=v3)

    wordcloud(d3$word, d3$freq,min.freq = 1, max.words =30,scale = c(3.0,.25),colors=brewer.pal(8, "Dark2"))

Unique Word Coverage

    unigrams<-function(x)
      {tdm <- TermDocumentMatrix(x, control = list(tokenize = UniToken))
       fm <- rowSums(as.matrix(tdm))
       ngram<-data.frame(ngram=names(fm),freq=fm)
       ngram<-ngram[order(-ngram$freq),]
    }

    full_a1<-unigrams(samples_a)
    
    wordcoverage<-function(x,wordcover) #x is the unigram output sorted by frequency, y is the percent word coverage
      {nwords<-0 # initial counter
      coverage<-wordcover*sum(x$freq) # number of words to hit coverage
      for (i in 1:nrow(x))
        {if (nwords >= coverage) {return (i)}
        nwords<-nwords+x$freq[i]
      }}

    dictionary.size.to.coverage.uni <- cumsum(full_a1$freq * 100 / sum(full_a1$freq))
    fiftypercentcoverage<-wordcoverage(full_a1,0.5)
    
    fiftypercentcoverage<-wordcoverage(full_a1,0.5)
    fiftypercentcoverage
## [1] 924
    ninetypercentcoverage<-wordcoverage(full_a1,0.9)
    ninetypercentcoverage
## [1] 9589
    #Total words in the corpus
    totalwordsincorpus<-sum(full_a1$freq)
    totalwordsincorpus
## [1] 70247
    plot(x=1:length(dictionary.size.to.coverage.uni),
      y=dictionary.size.to.coverage.uni,
      type="l", main="Unigram Coverage", 
      xlab="Dictionary Size (words)", 
      ylab="Coverage (percent)")

    abline (h=50, v= fiftypercentcoverage,col= "blue")
    points.default(x=fiftypercentcoverage, y=50, type="p", pch=22, col="black", bg=NA, cex=1.)
    abline (h=90, v= ninetypercentcoverage,col= "red")
    points.default(x=ninetypercentcoverage, y=90, type="p", pch=22, col="black", bg=NA, cex=1.)    
    legend(13000, 40, legend=c("90% coverage", "50% coverage"),
       col=c("red", "blue"), lty=1:1, cex=0.8)
    
    text(0,55, "50% coverage",pos=4)
    text(fiftypercentcoverage,50, as.character(fiftypercentcoverage),pos=1)
    
    text(0,90, "90% coverage",pos=4)
    text(ninetypercentcoverage,90, as.character(ninetypercentcoverage),pos=1)

    text(7500,15, "Total words in corpus",pos=3,,col = "orange")
    text(7500,15, as.character(totalwordsincorpus),pos=1,col = "orange")

Observations & Plans

Initially I faced a lot of challenges and wasted a lot of time due to large sample size I chose, but in the end I learnt from it.

The above report does show some basic exploratory analysis. The next steps of this capstone project would be to develop a predictive algorithm that would be used in a Shiny app.

I have not studied yet how I am going to make the prediction of next word challenge for the shiny app, but I can guess that i can use bigram, trigrams models to a larger extent, but I am sure that would not be enough.