Introduction

The goal of this milestone report is to demonstrate my progress in working with the data and to show that I am on track to build the prediction algorithm. I look forward to hearing your feedback!

Loading the Raw Data

library(pryr) # Size Checking 'object_size()'

# Load the three raw English corpora line by line. Strings are marked as UTF-8
# and embedded NUL bytes are skipped; `n` caps how many lines are read.
blogs <- readLines(
  con = "./final/en_US/en_US.blogs.txt",
  n = 899288, # 899288 total lines
  encoding = "UTF-8",
  skipNul = TRUE
)
# NOTE(review): the cap below is one less than the stated total for the news
# file, so its last line is not read -- confirm whether that is intended.
news <- readLines(
  con = "./final/en_US/en_US.news.txt",
  n = 1010240, # 1010241 total lines
  encoding = "UTF-8",
  skipNul = TRUE
)
twitter <- readLines(
  con = "./final/en_US/en_US.twitter.txt",
  n = 2360147, # 2360147 total lines
  encoding = "UTF-8",
  skipNul = TRUE
)

Basic File Information

library(qdap) # Word/Line Count

# For each corpus report, in this order: in-memory size, word count, line count.
# Words are counted by splitting every line on runs of non-word characters
# and summing the number of pieces per line.

# --- blogs ---
object_size(blogs)
sum(lengths(strsplit(blogs, split = "\\W+")))
length(blogs)

# --- news ---
object_size(news)
sum(lengths(strsplit(news, split = "\\W+")))
length(news)

# --- twitter ---
object_size(twitter)
sum(lengths(strsplit(twitter, split = "\\W+")))
length(twitter)

The table below summarizes the results of the basic file information commands:

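| File Name         | Size Loaded in R (MB) | Number of Lines | Number of Words |
|-------------------|-----------------------|-----------------|-----------------|
| en_US.blogs.txt   | 261                   | 899288          | 38371704        |
| en_US.news.txt    | 262                   | 1010240         | 35783000        |
| en_US.twitter.txt | 316                   | 2360147         | 31149377        |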

Sample 4% of each data set

# Fraction of each corpus to keep, in decimal form (0.04 = 4%).
# Rationale: 20% of 20% should contain 80% of 80%, i.e. 64% of the information.
sample_percent <- 0.04

# Fixed seed (picked from 1 - 10^6 on random.org) so the draws are repeatable.
set.seed(72570)

# Draw lines without replacement (the default for sample()); the sample size is
# the chosen fraction of the corpus length, rounded up.
blog_s <- sample(blogs, ceiling(length(blogs) * sample_percent))
news_s <- sample(news, ceiling(length(news) * sample_percent))
twitter_s <- sample(twitter, ceiling(length(twitter) * sample_percent))

# In-memory size of each sample.
object_size(blog_s)
## 10.4 MB
object_size(news_s)
## 10.4 MB
object_size(twitter_s)
## 12.8 MB

# The full corpora and the fraction are no longer needed; drop them to save RAM.
rm(sample_percent, twitter, news, blogs)

Concatenate all data sets into a single set (training)

# Pool the three samples into a single character vector.
training <- c(blog_s, news_s, twitter_s)

# Shuffle the pooled lines with a fixed seed by indexing with a random
# permutation of the positions.
set.seed(848441)
training <- training[sample.int(length(training))]

# In-memory size of the combined training vector.
object_size(training)
## 33.6 MB

Export Data for Later Analysis

# Persist the shuffled training vector so later sections can reload it.
save(training, file = "training-f.RData")

# Free RAM: drop the training vector together with the three per-source
# samples it was built from. Each object is named exactly once, so rm() no
# longer warns about a missing object and no sample is left behind.
rm(training, blog_s, news_s, twitter_s)

Clear Workspace and Load ‘quanteda’ Package

# Empty the workspace to save RAM, then restore only the saved training vector.
rm(list = ls())

load(file = "training-f.RData")

library(quanteda)
## Package version: 1.3.0
## Parallel computing: 2 of 4 threads used.
## See https://quanteda.io for tutorials and examples.
## 
## Attaching package: 'quanteda'
## The following object is masked from 'package:utils':
## 
##     View
library(readtext)
library(ggplot2)

# Set the number of threads quanteda may use.
quanteda_options(threads = 4)

Create the ‘training’ corpus ‘c_training’

# Build a quanteda corpus from the training lines and save it to disk.
c_training <- corpus(x = training)
save(c_training, file = "c_training-f.RData")

Tokenize and clean the data

# Tokenise the corpus. The remove_* flags ask quanteda to strip numbers,
# punctuation, symbols, Twitter characters and hyphens while tokenising.
t_training <- tokens(
  c_training,
  remove_numbers = TRUE,
  remove_punct = TRUE,
  remove_symbols = TRUE,
  remove_twitter = TRUE,
  remove_hyphens = TRUE
)

# Save the tokens to disk.
save(t_training, file = "t_training-f.RData")

Generate n-grams (2, 3, and 4)

# Build bigrams, trigrams and quadgrams from the cleaned tokens, saving each
# set to its own file as soon as it has been generated.
n2 <- tokens_ngrams(t_training, n = 2)
save(n2, file = "n2-f.RData")

n3 <- tokens_ngrams(t_training, n = 3)
save(n3, file = "n3-f.RData")

n4 <- tokens_ngrams(t_training, n = 4)
save(n4, file = "n4-f.RData")

Generate DFM (Document-Feature Matrix)

# Document-feature matrix of the unigram tokens. Lower-casing is dfm()'s
# default behaviour; it is spelled out here so the case folding is visible.
d_training <- dfm(t_training, tolower = TRUE)

Plot Frequency of Unigrams, Bigrams, Trigrams, and Quadgrams

# Bar charts of the most frequent terms. Each chart takes the top-n rows of
# textstat_frequency() and orders the bars by rank (most frequent last on the
# axis, because features are reordered by negative rank).

# Top 20 unigrams
ggplot(
  data = textstat_frequency(d_training, n = 20),
  mapping = aes(x = reorder(feature, -rank), y = frequency)
) +
  geom_col() +
  labs(x = "", y = "Unigram Term Frequency")

# Top 10 bigrams
ggplot(
  data = textstat_frequency(dfm(n2), n = 10),
  mapping = aes(x = reorder(feature, -rank), y = frequency)
) +
  geom_col() +
  labs(x = "Text", y = "Bigram Term Frequency")

# Top 10 trigrams
ggplot(
  data = textstat_frequency(dfm(n3), n = 10),
  mapping = aes(x = reorder(feature, -rank), y = frequency)
) +
  geom_col() +
  labs(x = "Text", y = "Trigram Term Frequency")

# Top 5 quadgrams
ggplot(
  data = textstat_frequency(dfm(n4), n = 5),
  mapping = aes(x = reorder(feature, -rank), y = frequency)
) +
  geom_col() +
  labs(x = "Text", y = "Quadgram Term Frequency")

Next Steps