This is a quick preliminary exploration of the tools available in R for text mining and for analyzing large text samples.

#Capstone Week 2 Milestone Report

library(stringi)
library(tm)
library(ggplot2)
library(RWeka)
library(kableExtra)
library(dplyr)

#Part 1 - Exploratory Analysis of the three files


#Read each file once in binary mode (binary mode keeps readLines from stopping
#early at embedded control characters) and keep the first 2,000 lines as a sample.
news <- file("en_US.news.txt", open = "rb")
news_lines <- readLines(news, encoding = "UTF-8", skipNul = TRUE, warn = FALSE)
news_sample <- head(news_lines, 2000)
close(news)

blog <- file("en_US.blogs.txt", open = "rb")
blog_lines <- readLines(blog, encoding = "UTF-8", skipNul = TRUE, warn = FALSE)
blog_sample <- head(blog_lines, 2000)
close(blog)

twitter <- file("en_US.twitter.txt", open = "rb")
twit_lines <- readLines(twitter, encoding = "UTF-8", skipNul = TRUE, warn = FALSE)
twit_sample <- head(twit_lines, 2000)
close(twitter)

size <- file.size(c("en_US.news.txt", "en_US.blogs.txt", "en_US.twitter.txt"))/1000^2

lines <- sapply(list(news_lines, blog_lines, twit_lines), length)

words <- sapply(list(news_lines, blog_lines, twit_lines),
                function(x) sum(stri_count_words(x)))

chars <- sapply(list(news_lines, blog_lines, twit_lines),
                function(x) sum(nchar(x)))

Summary

This is a quick summary of the three files: size in MB, number of lines, total words, total characters, and words per line (WPL).

Blogs have by far the most words per line (WPL), far more than Twitter, which is in line with what we would expect given Twitter's character limit. (A quick per-line check on the samples follows the summary table below.)

file_summary <- data.frame(
        File = c("News", "Blogs", "Twitter"),
        Size = size,
        Lines = lines,
        Words = words,
        Characters = chars,
        WPL = words/lines
)

kable(file_summary) %>%
  kable_styling(bootstrap_options = c("striped", "hover"))

| File    | Size (MB) |   Lines |    Words | Characters |      WPL |
|---------|----------:|--------:|---------:|-----------:|---------:|
| News    |  205.8119 |   75259 |  2623786 |   15275662 | 34.86342 |
| Blogs   |  210.1600 |  897288 | 38070821 |  207907358 | 42.42876 |
| Twitter |  167.1053 | 2358148 | 30192647 |  162247797 | 12.80354 |
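As a rough sanity check of the WPL column, the per-line word counts can also be summarised directly. This is only a sketch run on the 2,000-line samples rather than the full files, and the News/Blogs/Twitter labels are just names I give the list elements here:

#Per-line word-count summaries for each 2,000-line sample (sketch only)
sapply(list(News = news_sample, Blogs = blog_sample, Twitter = twit_sample),
       function(x) summary(stri_count_words(x)))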


Unigrams and Bigrams

We wrap up by illustrating features of the data with simple histograms of the most common unigrams and bigrams. This requires first cleaning the data: removing profanity and non-English symbols, and converting everything to lower case.

#Clean up of samples and corpus

blog_sample <- iconv(blog_sample, "latin1", "ASCII", sub = "")
news_sample <- iconv(news_sample, "latin1", "ASCII", sub = "")
twit_sample <- iconv(twit_sample, "latin1", "ASCII", sub = "")

corpus <- VCorpus(VectorSource(c(blog_sample, news_sample, twit_sample)),
                  readerControl = list(reader = readPlain, language = "en"))

url <- "https://raw.githubusercontent.com/RobertJGabriel/Google-profanity-words/master/list.txt"
profanity <- read.delim(url, header = FALSE)


corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, removeWords, stopwords("english"))
corpus <- tm_map(corpus, removeWords, profanity$V1)

unigram <- TermDocumentMatrix(corpus)
unigramv <- sort(rowSums(as.matrix(unigram)), decreasing = TRUE)
unigramd <- data.frame(word = names(unigramv), freq = unigramv)

ggplot(unigramd[1:25,], aes(x = reorder(word, freq), y = freq))+
        geom_bar(stat = "identity", fill = "#1c65c5", alpha = .8, width = .8)+
        coord_flip()+
        theme_bw()+
        xlab("")+
        ylab("Word Frequency")+
        labs(title = "Unigram - Most Common Words")

#Tokenize Functions

bigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))

bigram <- TermDocumentMatrix(corpus, control = list(tokenize = bigramTokenizer))
bigramv <- sort(rowSums(as.matrix(removeSparseTerms(bigram, 0.999))), decreasing = TRUE)
bigramd <- data.frame(word = names(bigramv), freq = bigramv)

ggplot(bigramd[1:25,], aes(x = reorder(word, freq), y = freq))+
        geom_bar(stat = "identity", fill = "#D92212", alpha = .8, width = .8)+
        coord_flip()+
        theme_bw()+
        xlab("")+
        ylab("Word Frequency")+
        labs(title = "Bigram - Most Common Words")

Going Forward

Due to time constraints, I was not able to incorporate many of the functions I would have liked. This is a very basic quantitative analysis, but I would love to move on to more qualitative work and see what kind of results it could produce.

Thank you for reading.