In this write-up I wrote several functions that analyze text for me. The functions:

  1. Read a random sample of lines from the data files
  2. Build a clean corpus
  3. Plot word frequencies

Unfortunately, I couldn’t figure out how to use the n-gram method and I feel a bit overwhelmed; a rough sketch of one possible approach follows.
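
One approach I plan to try next is RWeka’s NGramTokenizer, passed to tm through the tokenize control option. This is only a minimal sketch, not yet tested on this data, and it assumes a cleaned corpus such as blogs.corpus built further below:

# Sketch: bigram tokenization with RWeka's NGramTokenizer (untested)
library(tm)
library(RWeka)

# Custom tokenizer that splits each document into two-word sequences
BigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))

# Build a term-document matrix of bigrams from an already cleaned corpus
# (assumes blogs.corpus from the Data Reading & Processing section below)
bigram.tdm <- TermDocumentMatrix(blogs.corpus, control = list(tokenize = BigramTokenizer))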

Function Definitions

# Load Libraries
library(tm)
## Loading required package: NLP
library(SnowballC)
library(RWeka)
library(wordcloud)
## Loading required package: RColorBrewer
library(ggplot2)
## 
## Attaching package: 'ggplot2'
## 
## The following object is masked from 'package:NLP':
## 
##     annotate
library(corrplot)
library(magrittr)

# Define Function to Read a Random Sample of Lines from a Data File
readSampleLines <- function(datafile, numLines) {
    linesCtr  <- 0
    linesRead <- character()
    
    file.con <- file(datafile, "r")
    while ((linesCtr < numLines) && 
               (length(oneLine <- readLines(file.con, n = 1, warn = FALSE)) > 0)) {
        if(rbinom(1, 1, .5)) {
            # Keep only letters, digits, slashes, apostrophes and spaces
            # (this also strips non-ASCII characters)
            linesRead <- c(linesRead, gsub("[^0-9A-Za-z/' ]", "", oneLine))
            linesCtr <- linesCtr + 1
        }
    }    
    close(file.con)
    
    linesRead
}
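
Because the function keeps each line with probability 0.5 via rbinom, setting a seed first makes the sample reproducible. For example (the seed value and the 100-line count are arbitrary placeholders):

# Fix the random seed so the same sample of lines is drawn on every run
set.seed(1234)
blogs.sample <- readSampleLines("../Data/en_US/en_US.blogs.txt", 100)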

# Define Print Lines Function
printLines <- function(file, line, width=78) {
    if(missing(line)) 
        line <- 1:length(file)
    
    for(i in line) {
        cat(paste("[[", i, "]] ", sep=""))
        writeLines(strwrap(file[[i]], width=width))
    }
}
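
For example, to print the first three sampled blog lines wrapped at the default width (blogs.data is read in the next section):

# Display the first three sampled lines with wrapped output
printLines(blogs.data, 1:3)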

# Define Corpus Building and Cleaning Function
buildCleanCorpus <- function(characterVec) {
    # Build a Corpus, Specify the Source to be Character Vectors
    corpus <- Corpus(VectorSource(characterVec))
    
    # Clean Corpus
    corpus <- tm_map(corpus, stripWhitespace)
    corpus <- tm_map(corpus, removeNumbers)
    corpus <- tm_map(corpus, removePunctuation)
    corpus <- tm_map(corpus, content_transformer(tolower))
    corpus <- tm_map(corpus, content_transformer(function(x) gsub("http[[:alnum:]]*", "", x)))
    
    profanity <- read.table("../Data/bad-words.txt", stringsAsFactors = F)
    corpus <- tm_map(corpus, removeWords, c(stopwords("english"), profanity[[1]]))
    
    # Stem/Tokenize
    #corpus.copy <- corpus
    corpus <- tm_map(corpus, stemDocument)
    
    # Stem Completion
    #corpus <- tm_map(corpus, stemCompletion, dictionary=corpus.copy)
    
    # Return the cleaned corpus
    corpus
}
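
As a quick sanity check, tm’s inspect can be used to look at a couple of documents after cleaning (here using blogs.corpus, which is built in the next section):

# Inspect the first two cleaned documents to verify the transformations
inspect(blogs.corpus[1:2])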

# Define Unigram Frequency Plot Function
FreqPlot <- function(dtm, minFreq=100) {
    termFrequency <- colSums(as.matrix(dtm))
    #termFrequency <- subset(termFrequency, termFrequency>=minFreq)
    tf.df <- data.frame(term=names(termFrequency), freq=termFrequency)
    
    subset(tf.df, freq>=minFreq) %>% 
        ggplot(aes(term, freq)) + geom_bar(stat="identity") + coord_flip()
}

Data Reading & Processing

I use the functions above to read the data, transform it into a corpus, and plot the term frequencies. One thing to notice is that the Twitter data contains far fewer high-frequency words.

# Read Data
root      <- "/Users/gabrielm/"
sub.root  <- "OneDrive/Documents/HW/Coursera/Data Science Specialization/10 - Capstone Project/"

blogs.data   <- readSampleLines("../Data/en_US/en_US.blogs.txt"  , 5000)
news.data    <- readSampleLines("../Data/en_US/en_US.news.txt"   , 5000)
twitter.data <- readSampleLines("../Data/en_US/en_US.twitter.txt", 5000)

# Transform to Clean Corpus
blogs.corpus   <- buildCleanCorpus(blogs.data  )
news.corpus    <- buildCleanCorpus(news.data   )
twitter.corpus <- buildCleanCorpus(twitter.data)

# Convert to Document Term Matrix
blogs.dtm   <- DocumentTermMatrix(blogs.corpus  )
news.dtm    <- DocumentTermMatrix(news.corpus   )
twitter.dtm <- DocumentTermMatrix(twitter.corpus)
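
Before plotting, tm’s findFreqTerms gives a quick list of the terms that clear a frequency cutoff; I use it here only as a sanity check for the minFreq = 200 value used below.

# List terms that appear at least 200 times in each corpus
findFreqTerms(blogs.dtm,   lowfreq = 200)
findFreqTerms(news.dtm,    lowfreq = 200)
findFreqTerms(twitter.dtm, lowfreq = 200)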

Term Frequencies

FreqPlot(blogs.dtm  , minFreq = 200)

FreqPlot(news.dtm   , minFreq = 200)

FreqPlot(twitter.dtm, minFreq = 200)
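
Since the wordcloud package is already loaded, the same frequencies can also be shown as a word cloud. A minimal sketch, assuming the blogs document-term matrix fits in memory (the cutoff and palette are placeholders):

# Sketch: word cloud of the most frequent blog terms
blogs.freq <- colSums(as.matrix(blogs.dtm))
wordcloud(names(blogs.freq), blogs.freq, min.freq = 200,
          random.order = FALSE, colors = brewer.pal(8, "Dark2"))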