This is a quick preliminary exploration of the tools available in R for text mining and for analyzing large text samples.

#Capstone Week 2 Milestone Report

library(stringi)
library(tm)
library(ggplot2)
library(RWeka)
library(kableExtra)
library(dplyr)

#Part 1 - Exploratory Analysis of the three files


#Read each file once in binary mode (binary mode keeps readLines from stopping
#early at embedded control characters) and keep the first 2,000 lines as a sample.
news <- file("en_US.news.txt", open = "rb")
news_lines <- readLines(news, encoding = "UTF-8", skipNul = TRUE, warn = FALSE)
news_sample <- head(news_lines, 2000)
close(news)

blog <- file("en_US.blogs.txt", open = "rb")
blog_lines <- readLines(blog, encoding = "UTF-8", skipNul = TRUE, warn = FALSE)
blog_sample <- head(blog_lines, 2000)
close(blog)

twitter <- file("en_US.twitter.txt", open = "rb")
twit_lines <- readLines(twitter, encoding = "UTF-8", skipNul = TRUE, warn = FALSE)
twit_sample <- head(twit_lines, 2000)
close(twitter)

size <- file.size(c("en_US.news.txt", "en_US.blogs.txt", "en_US.twitter.txt"))/1000^2

lines <- sapply(list(news_lines, blog_lines, twit_lines), length)

words <- sapply(list(news_lines, blog_lines, twit_lines),
                function(x) sum(stri_count_words(x)))

chars <- sapply(list(news_lines, blog_lines, twit_lines),
                function(x) sum(nchar(x)))

Summary

This is a quick summary of the three files: size in MB, number of lines, total words, total characters, and words per line (WPL).

Blogs have by far the most words per line (WPL), far more than Twitter, which is in line with what we would expect given Twitter's character limit. (A quick per-line check on the samples follows the summary table below.)

file_summary <- data.frame(
        File = c("News", "Blogs", "Twitter"),
        Size = size,
        Lines = lines,
        Words = words,
        Characters = chars,
        WPL = words/lines
)

kable(file_summary) %>%
  kable_styling(bootstrap_options = c("striped", "hover"))

| File    | Size (MB) |   Lines |    Words | Characters |      WPL |
|---------|----------:|--------:|---------:|-----------:|---------:|
| News    |  205.8119 |   75259 |  2623786 |   15275662 | 34.86342 |
| Blogs   |  210.1600 |  897288 | 38070821 |  207907358 | 42.42876 |
| Twitter |  167.1053 | 2358148 | 30192647 |  162247797 | 12.80354 |
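As a rough sanity check of the WPL column, the per-line word counts can also be summarised directly. This is only a sketch run on the 2,000-line samples rather than the full files, and the News/Blogs/Twitter labels are just names I give the list elements here:

#Per-line word-count summaries for each 2,000-line sample (sketch only)
sapply(list(News = news_sample, Blogs = blog_sample, Twitter = twit_sample),
       function(x) summary(stri_count_words(x)))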


Unigrams and Bigrams

We wrap up by illustrating features of the data with simple histograms of the most common unigrams and bigrams. This requires first cleaning the data: removing profanity and non-English symbols, and converting everything to lower case.

#Clean up of samples and corpus

blog_sample <- iconv(blog_sample, "latin1", "ASCII", sub = "")
news_sample <- iconv(news_sample, "latin1", "ASCII", sub = "")
twit_sample <- iconv(twit_sample, "latin1", "ASCII", sub = "")

corpus <- VCorpus(VectorSource(c(blog_sample, news_sample, twit_sample)),
                  readerControl = list(reader = readPlain, language = "en"))

url <- "https://raw.githubusercontent.com/RobertJGabriel/Google-profanity-words/master/list.txt"
profanity <- read.delim(url, header = FALSE)


corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, removeWords, stopwords("english"))
corpus <- tm_map(corpus, removeWords, profanity$V1)

unigram <- TermDocumentMatrix(corpus)
unigramv <- sort(rowSums(as.matrix(unigram)), decreasing = TRUE)
unigramd <- data.frame(word = names(unigramv), freq = unigramv)

ggplot(unigramd[1:25,], aes(x = reorder(word, freq), y = freq))+
        geom_bar(stat = "identity", fill = "#1c65c5", alpha = .8, width = .8)+
        coord_flip()+
        theme_bw()+
        xlab("")+
        ylab("Word Frequency")+
        labs(title = "Unigram - Most Common Words")

#Tokenize Functions

bigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))

bigram <- TermDocumentMatrix(corpus, control = list(tokenize = bigramTokenizer))
bigramv <- sort(rowSums(as.matrix(removeSparseTerms(bigram, 0.999))), decreasing = TRUE)
bigramd <- data.frame(word = names(bigramv), freq = bigramv)

ggplot(bigramd[1:25,], aes(x = reorder(word, freq), y = freq))+
        geom_bar(stat = "identity", fill = "#D92212", alpha = .8, width = .8)+
        coord_flip()+
        theme_bw()+
        xlab("")+
        ylab("Word Frequency")+
        labs(title = "Bigram - Most Common Words")

Going Forward

Due to time constraints, I was not able to incorporate many of the functions I would have liked. This is a very basic quantitative analysis, but I would love to move on to more qualitative work and see what kind of results it could produce.

Thank you for reading.