# Capstone Week 2 Milestone Report
This is my quick preliminary exploration of the tools in R for text mining and analyzing large text samples.
library(stringi)
library(tm)
library(ggplot2)
library(RWeka)
library(kableExtra)
library(dplyr)
# Part 1 - Exploratory Analysis of the Three Files
# Read each file once, then keep the first 2,000 lines as a working sample
news <- file("en_US.news.txt", "r")
news_lines <- readLines(news, encoding = "UTF-8", skipNul = TRUE, warn = FALSE)
news_sample <- head(news_lines, 2000)
close(news)
blog <- file("en_US.blogs.txt", "r")
blog_lines <- readLines(blog, encoding = "UTF-8", skipNul = TRUE, warn = FALSE)
blog_sample <- head(blog_lines, 2000)
close(blog)
twitter <- file("en_US.twitter.txt", "r")
twit_lines <- readLines(twitter, encoding = "UTF-8", skipNul = TRUE, warn = FALSE)
twit_sample <- head(twit_lines, 2000)
close(twitter)
size <- file.info(c("en_US.news.txt",
                    "en_US.blogs.txt",
                    "en_US.twitter.txt"))$size / 1000^2
length <- sapply(list(news_lines, blog_lines, twit_lines), length)
words <- sapply(list(news_lines, blog_lines, twit_lines),
                function(x) sum(stri_count_words(x)))
nchar <- sapply(list(news_lines, blog_lines, twit_lines),
                function(x) sum(nchar(x)))
Below is a quick summary of each text file: size in MB, line count, word count, character count, and average words per line (WPL).
Blogs have by far the most words per line, far more than Twitter, which is in line with what we would expect given Twitter's character limit.
summary <- data.frame(
  File = c("News", "Blogs", "Twitter"),
  Size = size,
  Lines = length,
  Words = words,
  Characters = nchar,
  WPL = words / length
)
kable(summary) %>%
kable_styling(bootstrap_options = c("striped", "hover"))
| File | Size (MB) | Lines | Words | Characters | WPL |
|---|---|---|---|---|---|
| News | 205.8119 | 75259 | 2623786 | 15275662 | 34.86342 |
| Blogs | 210.1600 | 897288 | 38070821 | 207907358 | 42.42876 |
| Twitter | 167.1053 | 2358148 | 30192647 | 162247797 | 12.80354 |
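As a quick visual check of the WPL contrast noted above, the summary data frame can be plotted directly. This is a minimal sketch I am adding here, reusing the summary object and the same plotting style as the histograms further down; it is not part of the original tables.
ggplot(summary, aes(x = reorder(File, WPL), y = WPL)) +
  geom_bar(stat = "identity", fill = "#1c65c5", alpha = .8, width = .6) +
  coord_flip() +
  theme_bw() +
  xlab("") +
  ylab("Words per Line") +
  labs(title = "Average Words per Line by Source")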
We wrap up with illustrations of the features of the data, using simple histograms to display the most common unigrams and bigrams. This includes first cleaning the data by removing profanity and non-ASCII symbols and converting it all to lower case.
# Clean-up of Samples and Corpus
blog_sample <- iconv(blog_sample, "latin1", "ASCII", sub = "")
news_sample <- iconv(news_sample, "latin1", "ASCII", sub = "")
twit_sample <- iconv(twit_sample, "latin1", "ASCII", sub = "")
corpus <- VCorpus(VectorSource(c(blog_sample, news_sample, twit_sample)),
                  readerControl = list(reader = readPlain, language = "en"))
url <- "https://raw.githubusercontent.com/RobertJGabriel/Google-profanity-words/master/list.txt"
profanity <- read.delim(url, header = FALSE)
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, removeWords,stopwords("english"))
corpus <- tm_map(corpus, removeWords, profanity$V1)
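As a quick sanity check that the transformations behaved as intended, one cleaned document can be printed. This step is my own addition rather than part of the original cleaning pipeline.
writeLines(as.character(corpus[[1]]))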
unigram <- TermDocumentMatrix(corpus)
unigramv <- sort(rowSums(as.matrix(unigram)), decreasing = TRUE)
unigramd <- data.frame(word = names(unigramv), freq = unigramv)
# Tokenizer function for bigrams
bigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
ggplot(unigramd[1:25, ], aes(x = reorder(word, freq), y = freq)) +
  geom_bar(stat = "identity", fill = "#1c65c5", alpha = .8, width = .8) +
  coord_flip() +
  theme_bw() +
  xlab("") +
  ylab("Word Frequency") +
  labs(title = "Unigram - Most Common Words")
bigram <- TermDocumentMatrix(corpus, control = list(tokenize = bigramTokenizer))
bigramv <- sort(rowSums(as.matrix(removeSparseTerms(bigram, 0.999))), decreasing = TRUE)
bigramd <- data.frame(word = names(bigramv), freq = bigramv)
ggplot(bigramd[1:25, ], aes(x = reorder(word, freq), y = freq)) +
  geom_bar(stat = "identity", fill = "#D92212", alpha = .8, width = .8) +
  coord_flip() +
  theme_bw() +
  xlab("") +
  ylab("Word Frequency") +
  labs(title = "Bigram - Most Common Words")
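The same tokenizer pattern extends naturally to longer n-grams. As a sketch of a possible next step, not run for this report, a trigram version only changes the Weka_control bounds and reuses the cleaned corpus above.
# Trigram tokenizer: same approach as bigrams, with min and max raised to 3
trigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
trigram <- TermDocumentMatrix(corpus, control = list(tokenize = trigramTokenizer))
trigramv <- sort(rowSums(as.matrix(removeSparseTerms(trigram, 0.999))), decreasing = TRUE)
head(trigramv, 10)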
Due to time constraints I was not able to incorporate as many of the available functions as I would have liked. This is a very basic quantitative analysis, but I would love to move on to more qualitative work and see what kind of results it yields.
Thank you for reading.