The goal of this milestone report is to demonstrate my progress in working with the data and to show that I am well on track to building the prediction algorithm. I look forward to hearing your feedback!
library(pryr) # object_size() for in-memory size checks
library(qdap) # word/line count utilities

# Read the three raw en_US text files. `n` caps the number of lines read from
# each file; embedded NUL characters are skipped.
blogs <- readLines(
  con = "./final/en_US/en_US.blogs.txt",
  n = 899288, # 899288 total lines
  encoding = "UTF-8",
  skipNul = TRUE
)
# NOTE(review): the original comment reported 1010241 total lines while
# n = 1010240 -- confirm whether the final line is meant to be left out.
news <- readLines(
  con = "./final/en_US/en_US.news.txt",
  n = 1010240,
  encoding = "UTF-8",
  skipNul = TRUE
)
twitter <- readLines(
  con = "./final/en_US/en_US.twitter.txt",
  n = 2360147, # 2360147 total lines
  encoding = "UTF-8",
  skipNul = TRUE
)

# Basic information per source, printed in the order: size in memory,
# number of words (lines split on runs of non-word characters), line count.

# Blogs ----
object_size(blogs)
sum(lengths(strsplit(blogs, "\\W+")))
length(blogs)

# News ----
object_size(news)
sum(lengths(strsplit(news, "\\W+")))
length(news)

# Twitter ----
object_size(twitter)
sum(lengths(strsplit(twitter, "\\W+")))
length(twitter)
The table below summarizes the results of the basic file-information commands:
| File Name | Size Loaded in R (MB) | Number of Lines | Number of Words |
|---|---|---|---|
| en_US.blogs.txt | 261 | 899288 | 38371704 |
| en_US.news.txt | 262 | 1010240 | 35783000 |
| en_US.twitter.txt | 316 | 2360147 | 31149377 |
# Sampling fraction in decimal form (20% of 20% should contain 80% of 80%,
# or 64%, of the information).
sample_percent <- 0.04

# Draw `fraction` of the elements of `lines` (rounded up), without replacement.
sample_lines <- function(lines, fraction) {
  sample(lines, size = ceiling(fraction * length(lines)), replace = FALSE)
}

set.seed(72570) # seed from 1 - 10^6 on random.org
blog_s <- sample_lines(blogs, sample_percent)
news_s <- sample_lines(news, sample_percent)
twitter_s <- sample_lines(twitter, sample_percent)

# In-memory size of each sample (recorded output follows each call).
object_size(blog_s)
## 10.4 MB
object_size(news_s)
## 10.4 MB
object_size(twitter_s)
## 12.8 MB

# Remove the original data sets (and the sampling helpers) to save RAM.
rm(blogs, news, twitter, sample_percent, sample_lines)

# Merge the three samples and shuffle the combined vector randomly.
set.seed(848441)
training <- sample(c(blog_s, news_s, twitter_s))
object_size(training)
## 33.6 MB

# Checkpoint the training set to disk.
save(training, file = "training-f.RData")
# Free the per-source samples to save RAM: they have already been merged into
# `training`, which is checkpointed in "training-f.RData".
# (Previously this call named `training` twice, which deleted the training set,
# raised an "object 'training' not found" warning, and left blog_s / news_s in
# memory; a workspace-wide rm(list = ls()) was then needed to clean up.)
rm(blog_s, news_s, twitter_s)

# Reload the checkpoint only when `training` is not already in the workspace,
# e.g. when resuming the analysis from this point in a fresh R session.
if (!exists("training", inherits = FALSE)) {
  load("training-f.RData")
}
library(quanteda)
## Package version: 1.3.0
## Parallel computing: 2 of 4 threads used.
## See https://quanteda.io for tutorials and examples.
##
## Attaching package: 'quanteda'
## The following object is masked from 'package:utils':
##
## View
library(readtext)
library(ggplot2)
# Set the number of threads quanteda may use.
quanteda_options(threads = 4)

# Build the corpus from the sampled lines and checkpoint it to disk.
c_training <- corpus(training)
save(c_training, file = "c_training-f.RData")

# Tokenise the corpus with the number, punctuation, symbol, Twitter and hyphen
# removal flags all switched on, then checkpoint the tokens as well.
t_training <- tokens(
  c_training,
  remove_numbers = TRUE,
  remove_punct = TRUE,
  remove_symbols = TRUE,
  remove_twitter = TRUE,
  remove_hyphens = TRUE
)
save(t_training, file = "t_training-f.RData")
# Build 2-, 3- and 4-gram token sets from the unigram tokens, saving each one
# to its own .RData file as soon as it has been created.

# Bigrams ----
n2 <- tokens_ngrams(t_training, n = 2)
save(n2, file = "n2-f.RData")

# Trigrams ----
n3 <- tokens_ngrams(t_training, n = 3)
save(n3, file = "n3-f.RData")

# Quadgrams ----
n4 <- tokens_ngrams(t_training, n = 4)
save(n4, file = "n4-f.RData")
# Bar chart of the `top_n` most frequent features of a document-feature matrix.
#
# feature_matrix: a dfm whose feature frequencies are charted.
# top_n:          number of top features passed to textstat_frequency().
# x_label:        x-axis label.
# y_label:        y-axis label.
#
# Returns the ggplot object; calling the helper at top level prints the chart.
plot_top_features <- function(feature_matrix, top_n, x_label, y_label) {
  top_features <- textstat_frequency(feature_matrix, n = top_n)
  ggplot(top_features, aes(x = reorder(feature, -rank), y = frequency)) +
    geom_bar(stat = "identity") +
    labs(x = x_label, y = y_label)
}

# Document-feature matrix of the unigram tokens.
d_training <- dfm(t_training)

# Top 20 unigrams, top 10 bigrams and trigrams, top 5 quadgrams.
plot_top_features(d_training, 20, "", "Unigram Term Frequency")
plot_top_features(dfm(n2), 10, "Text", "Bigram Term Frequency")
plot_top_features(dfm(n3), 10, "Text", "Trigram Term Frequency")
plot_top_features(dfm(n4), 5, "Text", "Quadgram Term Frequency")