Exploring Data Size

Before going any further with our data set, it is a good idea to check the size of each object once it has been imported. When working with NLP we typically deal with corpora that take up a lot of memory.

# importing library
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(reshape2)
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths
library(ggplot2)
library(stringr)
library(tidytext)

# save path for each data
blog_path <- "final/en_US/en_US.blogs.txt"
news_path <- "final/en_US/en_US.news.txt"
twitter_path <- "final/en_US/en_US.twitter.txt"
all_object_path <- c(blog_path = blog_path, news_path = news_path, twitter_path = twitter_path)

# read each corpus into memory, one element per line
a <- readLines(blog_path)
b <- readLines(news_path)
c <- readLines(twitter_path)
## Warning in readLines(twitter_path): line 167155 appears to contain an embedded
## nul
## Warning in readLines(twitter_path): line 268547 appears to contain an embedded
## nul
## Warning in readLines(twitter_path): line 1274086 appears to contain an embedded
## nul
## Warning in readLines(twitter_path): line 1759032 appears to contain an embedded
## nul
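
The embedded-nul warnings above come from a few stray NUL bytes in the Twitter file. They are harmless here, but if we prefer a clean import, readLines() can be told to skip them; a minimal variant of the call:

# re-read the Twitter file, skipping the embedded NUL bytes
c <- readLines(twitter_path, skipNul = TRUE)
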
# explore the size of each object when imported
imported_object_size <- tribble(~category, ~object_size,
                               "blog_path", object.size(a),
                               "news_path", object.size(b),
                               "twitter_path", object.size(c)
                               )

imported_object_size
## # A tibble: 3 x 2
##   category     object_size    
##   <chr>        <objct_sz>     
## 1 blog_path    267758632 bytes
## 2 news_path    269840992 bytes
## 3 twitter_path 334484736 bytes
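
The raw byte counts above are hard to read at a glance. format() can express an object.size() result in more familiar units, for example:

# report the imported sizes in megabytes instead of raw bytes
format(object.size(a), units = "Mb")
format(object.size(b), units = "Mb")
format(object.size(c), units = "Mb")
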
# explore the original file size on disk
file_size <- lapply(all_object_path, function(path) {
  file.size(path)
}) %>% 
  bind_rows() %>% 
  pivot_longer(1:last_col()) %>% 
  rename(category = name, file_size = value)

Importing the Dataset

As we can see from the previous exploration, the biggest challenge when working with NLP is the amount of memory the workspace requires. We need to manage storage carefully, since NLP data sets demand both more space and heavier computation. How we import the data is therefore a crucial decision.
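
One simple habit that helps here is to check R's current memory footprint before and after a large import; base R's gc() reports this without any extra packages:

# check current memory usage (see the "used" columns) before importing more data
gc()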

In this part I will use only a small portion of the data as the training set and delete unused objects to free up memory. For this purpose, random sampling is used to import a subset of each file as the training data.

set.seed(123)
sample_pct <- 0.1

blog_sample <- a %>% 
  .[sample(length(a), length(a) * sample_pct)]

news_sample <- b %>% 
  .[sample(length(b), length(b) * sample_pct)]

twitter_sample <- c %>% 
  .[sample(length(c), length(c) * sample_pct)]
sample_source <- bind_rows(
  data.frame(text_source = blog_sample, source = "blog"),
  data.frame(text_source = news_sample, source = "news"),
  data.frame(text_source = twitter_sample, source = "twitter")
)
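
As a quick sanity check on the sample, we can count how many lines came from each source and measure how much memory the combined training set takes, which should be roughly a tenth of the full corpora:

# number of sampled lines per source
sample_source %>% 
  count(source)

# memory footprint of the combined sample
format(object.size(sample_source), units = "Mb")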

Summarizing the Data Set

# explore the number of lines
length_of_line <- tribble(~category, ~length_of_line,
                          "blog_path", length(a),
                           "news_path", length(b),
                           "twitter_path", length(c)
                          )

# Total Character in each file
total_char <- tribble(~category, ~total_char,
                      "blog_path", sum(nchar(a)),
                       "news_path", sum(nchar(b)),
                       "twitter_path", sum(nchar(c))
                      )

# explore total words in each file (counting whitespace-separated tokens)
total_word <- tribble(~category, ~total_word,
                      "blog_path", sum(str_count(a, "\\S+")),
                       "news_path", sum(str_count(b, "\\S+")),
                       "twitter_path", sum(str_count(c, "\\S+"))
                      )
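
The word totals here are approximated by counting runs of non-whitespace characters, which is a common rough tokenization. On a toy string the pattern behaves like this:

# "\\S+" matches each whitespace-separated token, so this returns 4
str_count("this has four tokens", "\\S+")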

# summarize all data
data_summary <- imported_object_size %>% 
  left_join(file_size, by = "category") %>% 
  left_join(length_of_line, by = "category") %>% 
  left_join(total_char, by = "category") %>% 
  left_join(total_word, by = "category") %>% 
  melt()
## Using category as id variables
## Warning: attributes are not identical across measure variables; they will be
## dropped
# display the table of the summary
data_summary
##        category       variable     value
## 1     blog_path    object_size 267758632
## 2     news_path    object_size 269840992
## 3  twitter_path    object_size 334484736
## 4     blog_path      file_size 210160014
## 5     news_path      file_size 205811889
## 6  twitter_path      file_size 167105338
## 7     blog_path length_of_line    899288
## 8     news_path length_of_line   1010242
## 9  twitter_path length_of_line   2360148
## 10    blog_path     total_char 206824505
## 11    news_path     total_char 203223159
## 12 twitter_path     total_char 162096031
## 13    blog_path     total_word 206824505
## 14    news_path     total_word 203223159
## 15 twitter_path     total_word 162096031
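
reshape2::melt() works here, but since tidyr is already loaded, the same reshaping can be done with pivot_longer(); converting the object_size column to a plain numeric first should also avoid the dropped-attributes warning. A sketch of the equivalent call:

# equivalent long format using tidyr instead of reshape2
data_summary_long <- imported_object_size %>% 
  mutate(object_size = as.numeric(object_size)) %>% 
  left_join(file_size, by = "category") %>% 
  left_join(length_of_line, by = "category") %>% 
  left_join(total_char, by = "category") %>% 
  left_join(total_word, by = "category") %>% 
  pivot_longer(-category, names_to = "variable", values_to = "value")
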
# Visualize the summary
data_summary %>% 
  ggplot(aes(x = category, y = value)) + 
  geom_bar(aes(fill = variable), stat = "identity", position = "dodge") +
  ggrepel::geom_label_repel(aes(label = value))

# cleaning up
rm(list = c("a", "b", "c", "blog_sample", "news_sample", "twitter_sample"))
gc(reset = TRUE)
##            used  (Mb) gc trigger  (Mb) max used  (Mb)
## Ncells  2504332 133.8    8520061 455.1  2504332 133.8
## Vcells 16744595 127.8   90621362 691.4 16744595 127.8

This visualization reveals something interesting: the Twitter corpus has the smallest file on disk, yet the largest object size once imported. The takeaway is that the more lines a file contains, the more memory the imported object needs, regardless of its size on disk.
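
A rough way to check this intuition is to compute bytes per line for each source from the summary table; character vectors in R carry a fixed overhead per element, so many short lines cost more memory than a few long ones with the same total characters. A quick sketch:

# rough bytes-per-line comparison across the three sources
data_summary %>% 
  pivot_wider(names_from = variable, values_from = value) %>% 
  mutate(object_bytes_per_line = object_size / length_of_line,
         file_bytes_per_line = file_size / length_of_line)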

Cleaning Up

After importing the data set, the first thing we need to do is clean it up.

# create profanity list
profanity <- read.csv("swears.csv", stringsAsFactors = FALSE) %>% 
  mutate(word = badwords)

# create stop word list
data("stop_words", package = "tidytext")

# create urls regular expression
url <- "[hH][Tt][Tt][Pp][^[:space:]]*"

# create filter for non alpha numeric
non_alph <- "[^[:alpha:][:space:]]*"

cleaning <- sample_source %>%
  mutate(text_source = str_replace_all(text_source, url, "")) %>% 
  mutate(text_source = str_replace_all(text_source, non_alph, "")) %>% 
  mutate(text_source = iconv(text_source, to = "ASCII//TRANSLIT"))
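
To see what these patterns actually remove, it helps to try them on a small made-up line before applying them to the whole sample:

# on a toy line: first drop the URL, then drop anything that is not a letter or a space
"Check http://example.com, it's 100% great!" %>% 
  str_replace_all(url, "") %>% 
  str_replace_all(non_alph, "")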

Deeper Exploration

So far we have only seen a brief summary of the data. Now we can explore it in more depth.

repo <- cleaning %>% 
  unnest_tokens(word, text_source) %>% 
  anti_join(profanity) %>% 
  anti_join(stop_words)
## Joining, by = "word"
## Joining, by = "word"
# word frequencies
word_freq <- repo %>% 
  group_by(word) %>% 
  summarize(val = n())

# marginal distribution of word freq
hist(log(word_freq$val))

# how many unique words make up 90% of the vocabulary (each word weighted equally)
prop_90 <- word_freq %>% 
  mutate(prop = 1 / n()) %>% 
  arrange(prop) %>% 
  mutate(unique_coverage = cumsum(prop)) %>% 
  filter(unique_coverage <= 0.9)

prop_90 %>% 
  nrow()
## [1] 189747
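
A frequency-weighted version of this question, namely how many distinct words are needed to cover 90% of all word occurrences in the sample, is often more informative. A sketch of that calculation (its result will differ from the count above):

# sort words by frequency and accumulate their share of all occurrences
coverage_90 <- word_freq %>% 
  arrange(desc(val)) %>% 
  mutate(coverage = cumsum(val) / sum(val)) %>% 
  filter(coverage <= 0.9)

nrow(coverage_90)
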
# Top 15 most frequent words
word_freq %>% 
  top_n(15, val) %>% 
  ggplot(aes(reorder(word, val), val)) + 
  geom_col(fill = "dodgerblue") + 
  ggtitle("Main 15 Features")+
  labs(x = "Word", y = "Frequencies")+
  theme_light()