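In this write-up I wrote a set of functions that analyze text for me. The functions read a random sample of lines from a data file, print selected lines, build and clean a corpus, and plot term frequencies.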
Unfortunately, I couldn't figure out how to use the n-gram method, and I feel a bit overwhelmed.
# Load Libraries
library(tm)
## Loading required package: NLP
library(SnowballC)
library(RWeka)
library(wordcloud)
## Loading required package: RColorBrewer
library(ggplot2)
##
## Attaching package: 'ggplot2'
##
## The following object is masked from 'package:NLP':
##
## annotate
library(corrplot)
library(magrittr)
# Define Read Sample Lines Data Function
#' Read a random sample of cleaned lines from a text file
#'
#' Reads `datafile` one line at a time. Each line read is kept with
#' probability `prob` (one `rbinom()` draw per line), and reading stops as
#' soon as `numLines` lines have been kept or the file is exhausted. Kept
#' lines have every character other than digits, ASCII letters, `/`, `'` and
#' space removed.
#'
#' Because sampling stops after `numLines` kept lines, the sample is drawn
#' from the beginning of the file rather than uniformly from the whole file.
#' Call `set.seed()` beforehand for a reproducible sample.
#'
#' @param datafile Path to the text file to sample from.
#' @param numLines Maximum number of lines to keep.
#' @param prob Probability of keeping each line that is read. Defaults to
#'   0.5, the value that was previously hard-coded.
#' @return A character vector holding at most `numLines` cleaned lines, in
#'   file order.
readSampleLines <- function(datafile, numLines, prob = 0.5) {
  file.con <- file(datafile, "r")
  # Release the connection even if reading or cleaning fails part-way through.
  on.exit(close(file.con), add = TRUE)

  linesCtr <- 0
  linesRead <- character()
  # `&&` short-circuits, so no further line is consumed once the quota is met.
  while ((linesCtr < numLines) &&
         (length(oneLine <- readLines(file.con, n = 1, warn = FALSE)) > 0)) {
    if (rbinom(1, 1, prob) == 1) {
      linesCtr <- linesCtr + 1
      # Keep only digits, letters, "/", "'" and spaces (the repeated "/" in
      # the character class is redundant but harmless); this is intended to
      # drop non-ASCII characters as well.
      # Indexed assignment avoids rebuilding the whole vector with c().
      linesRead[[linesCtr]] <- gsub("[^0-9A-Za-z///' ]", "", oneLine)
    }
  }
  linesRead
}
# Define Print Lines Functions
#' Print selected elements of a text collection, wrapped to a fixed width
#'
#' Each selected element is written as `[[i]] ` followed by its text, wrapped
#' with `strwrap()`.
#'
#' @param file A character vector or list of texts (not a file path, despite
#'   the name).
#' @param line Indices of the elements to print. Defaults to every element.
#' @param width Target line width passed to `strwrap()`.
#' @return `NULL`, invisibly; called for its printed output.
printLines <- function(file, line, width = 78) {
  if (missing(line)) {
    # seq_along() yields an empty sequence for empty input, whereas
    # 1:length(file) would yield c(1, 0) and fail on file[[1]].
    line <- seq_along(file)
  }
  for (i in line) {
    cat(paste0("[[", i, "]] "))
    writeLines(strwrap(file[[i]], width = width))
  }
}
# Define Corpus Building and Cleaning Function
#' Build a tm corpus from a character vector and clean it
#'
#' The cleaning steps are applied in this order: collapse whitespace, remove
#' numbers, remove punctuation, lower-case, remove tokens starting with
#' "http", remove English stop words and profanity, then stem.
#'
#' @param characterVec Character vector with one document per element.
#' @param profanityFile Path to a text file whose first column lists the
#'   words to remove in addition to the English stop words. Defaults to the
#'   location that was previously hard-coded.
#' @return The cleaned, stemmed corpus.
buildCleanCorpus <- function(characterVec,
                             profanityFile = "../Data/bad-words.txt") {
  # Read the word list first so a missing file fails before any corpus work.
  profanity <- read.table(profanityFile, stringsAsFactors = FALSE)
  wordsToRemove <- c(stopwords("english"), profanity[[1]])

  # Build a corpus, with each element of the character vector as a document.
  corpus <- Corpus(VectorSource(characterVec))

  # Clean corpus
  corpus <- tm_map(corpus, stripWhitespace)
  corpus <- tm_map(corpus, removeNumbers)
  corpus <- tm_map(corpus, removePunctuation)
  corpus <- tm_map(corpus, content_transformer(tolower))
  # Runs after punctuation removal, so a URL has already collapsed into one
  # alphanumeric token starting with "http", which this pattern removes whole.
  corpus <- tm_map(
    corpus,
    content_transformer(function(x) gsub("http[[:alnum:]]*", "", x))
  )
  # NOTE(review): punctuation is already stripped here, so stop words that
  # contain an apostrophe presumably no longer match; confirm if that matters.
  corpus <- tm_map(corpus, removeWords, wordsToRemove)

  # Stem. Stem completion is intentionally left disabled:
  # corpus <- tm_map(corpus, stemCompletion, dictionary = corpus.copy)
  corpus <- tm_map(corpus, stemDocument)

  # Return the corpus explicitly rather than as the invisible value of the
  # final assignment.
  corpus
}
# Define Unigram Frequencies Plot
#' Horizontal bar chart of the terms that occur at least `minFreq` times
#'
#' @param dtm A document-term matrix; term counts are summed across rows.
#' @param minFreq Minimum total count a term needs to be plotted.
#' @return A ggplot object with one bar per retained term.
FreqPlot <- function(dtm, minFreq=100) {
  # Total count of each term over all documents.
  termTotals <- colSums(as.matrix(dtm))
  freqTable <- data.frame(term = names(termTotals), freq = termTotals)

  # Keep only the terms that reach the frequency threshold.
  frequentTerms <- subset(freqTable, freq >= minFreq)

  ggplot(frequentTerms, aes(term, freq)) +
    geom_bar(stat = "identity") +
    coord_flip()
}
I use the previous functions to read the data, then transform it into a corpus and plot the term frequencies. One thing we notice is that the Twitter data contains far fewer high-frequency words.
# Project location (not referenced by the statements below)
root <- "/Users/gabrielm/"
sub.root <- "OneDrive/Documents/HW/Coursera/Data Science Specialization/10 - Capstone Project/"

# Each source goes through the same pipeline: sample up to 5000 lines, build a
# clean corpus, then convert it to a document-term matrix. The files are read
# in the order blogs, news, twitter so the random draws used for sampling are
# consumed in the same sequence.

# Blogs
blogs.data <- readSampleLines(datafile = "../Data/en_US/en_US.blogs.txt",
                              numLines = 5000)
blogs.corpus <- buildCleanCorpus(characterVec = blogs.data)
blogs.dtm <- DocumentTermMatrix(blogs.corpus)

# News
news.data <- readSampleLines(datafile = "../Data/en_US/en_US.news.txt",
                             numLines = 5000)
news.corpus <- buildCleanCorpus(characterVec = news.data)
news.dtm <- DocumentTermMatrix(news.corpus)

# Twitter
twitter.data <- readSampleLines(datafile = "../Data/en_US/en_US.twitter.txt",
                                numLines = 5000)
twitter.corpus <- buildCleanCorpus(characterVec = twitter.data)
twitter.dtm <- DocumentTermMatrix(twitter.corpus)

# Plot the terms that occur at least 200 times in each source
FreqPlot(dtm = blogs.dtm, minFreq = 200)
FreqPlot(dtm = news.dtm, minFreq = 200)
FreqPlot(dtm = twitter.dtm, minFreq = 200)