Turki 2022-06-17
This script uses the tidy data principles applied to text mining, as outlined in Text Mining with R: A Tidy Approach.
English Repository Files
blogs_file <- "./data/final/en_US/en_US.blogs.txt"
news_file <- "./data/final/en_US/en_US.news.txt"
twitter_file <- "./data/final/en_US/en_US.twitter.txt" File Sizes (Mb)
blogs_size <- file.size(blogs_file) / (2^20)
news_size <- file.size(news_file) / (2^20)
twitter_size <- file.size(twitter_file) / (2^20)Read the data files
blogs <- readLines(blogs_file, skipNul = TRUE)
news <- readLines(news_file, skipNul = TRUE)
twitter <- readLines(twitter_file, skipNul = TRUE)Number of Lines per file
blogs_lines <- length(blogs)
news_lines <- length(news)
twitter_lines <- length(twitter)
total_lines <- blogs_lines + news_lines + twitter_linesDistibution of characters per line, by file
blogs_nchar <- nchar(blogs)
news_nchar <- nchar(news)
twitter_nchar <- nchar(twitter)
boxplot(blogs_nchar, news_nchar, twitter_nchar, log = "y",
names = c("blogs", "news", "twitter"),
ylab = "log(Number of Characters)", xlab = "File Name")
title("Comparing Distributions of Chracters per Line")Total characters per file
blogs_nchar_sum <- sum(blogs_nchar)
news_nchar_sum <- sum(news_nchar)
twitter_nchar_sum <- sum(twitter_nchar)Total words per file
blogs_words <- wordcount(blogs, sep = " ")
news_words <- wordcount(news, sep = " ")
twitter_words <- wordcount(twitter, sep = " ")Create summary of repo stats
repo_summary <- data.frame(f_names = c("blogs", "news", "twitter"),
f_size = c(blogs_size, news_size, twitter_size),
f_lines = c(blogs_lines, news_lines, twitter_lines),
n_char = c(blogs_nchar_sum, news_nchar_sum, twitter_nchar_sum),
n_words = c(blogs_words, news_words, twitter_words))
repo_summary <- repo_summary %>% mutate(pct_n_char = round(n_char/sum(n_char), 2))
repo_summary <- repo_summary %>% mutate(pct_lines = round(f_lines/sum(f_lines), 2))
repo_summary <- repo_summary %>% mutate(pct_words = round(n_words/sum(n_words), 2))
kable(repo_summary)| f_names | f_size | f_lines | n_char | n_words | pct_n_char | pct_lines | pct_words |
|---|---|---|---|---|---|---|---|
| blogs | 200.4242 | 899288 | 206824505 | 37334131 | 0.36 | 0.21 | 0.37 |
| news | 196.2775 | 1010242 | 203223159 | 34372530 | 0.36 | 0.24 | 0.34 |
| 159.3641 | 2360148 | 162096241 | 30373583 | 0.28 | 0.55 | 0.30 |
saveRDS(repo_summary, "./data/final/en_US/repo_sample.rds")Read the data files into dataframes
blogs <- data_frame(text = blogs)## Warning: `data_frame()` was deprecated in tibble 1.1.0.
## Please use `tibble()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
news <- data_frame(text = news)
twitter <- data_frame(text = twitter)set.seed(1001)
sample_pct <- 0.1
blogs_sample <- blogs %>%
sample_n(., nrow(blogs)*sample_pct)
news_sample <- news %>%
sample_n(., nrow(news)*sample_pct)
twitter_sample <- twitter %>%
sample_n(., nrow(twitter)*sample_pct)Create tidy repository
repo_sample <- bind_rows(mutate(blogs_sample, source = "blogs"),
mutate(news_sample, source = "news"),
mutate(twitter_sample, source = "twitter"))
repo_sample$source <- as.factor(repo_sample$source)Create filters: stopwords, profanity, non-alphanumeric’s, url’s, repeated letters(+3x)
data("stop_words")
swear_words <- read_delim("./data/final/en_US/swearWords.csv", delim = "\n", col_names = FALSE)## Rows: 452 Columns: 1
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\001"
## chr (1): X1
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
swear_words <- unnest_tokens(swear_words, word, X1)
replace_reg <- "[^[:alpha:][:space:]]*"
replace_url <- "http[^[:space:]]*"
replace_aaa <- "\\b(?=\\w*(\\w)\\1)\\w+\\b" Clean the sample. Cleaning is separted from tidying so
unnest_tokens function can be used for words, and
ngrams.
clean_sample <- repo_sample %>%
mutate(text = str_replace_all(text, replace_reg, "")) %>%
mutate(text = str_replace_all(text, replace_url, "")) %>%
mutate(text = str_replace_all(text, replace_aaa, "")) %>%
mutate(text = iconv(text, "ASCII//TRANSLIT"))Clean up
rm(blogs, blogs_nchar, news, news_nchar, twitter, twitter_nchar, replace_reg, replace_url, replace_aaa)Create tidy dataframe for repo sample
tidy_repo <- clean_sample %>%
unnest_tokens(word, text) %>%
anti_join(swear_words) %>%
anti_join(stop_words)## Joining, by = "word"
## Joining, by = "word"
Word counts: Number of unique words in repo
(repo_count <- tidy_repo %>%
summarise(keys = n_distinct(word)))## # A tibble: 1 × 1
## keys
## <int>
## 1 155484
Number of words to attain 50% and 90% coverage of all words in repo
cover_50 <- tidy_repo %>%
count(word) %>%
mutate(proportion = n / sum(n)) %>%
arrange(desc(proportion)) %>%
mutate(coverage = cumsum(proportion)) %>%
filter(coverage <= 0.5)
nrow(cover_50)## [1] 1274
cover_90 <- tidy_repo %>%
count(word) %>%
mutate(proportion = n / sum(n)) %>%
arrange(desc(proportion)) %>%
mutate(coverage = cumsum(proportion)) %>%
filter(coverage <= 0.9)
nrow(cover_90)## [1] 16935
Word distribution
cover_90 %>%
top_n(20, proportion) %>%
mutate(word = reorder(word, proportion)) %>%
ggplot(aes(word, proportion)) +
geom_col() +
xlab(NULL) +
coord_flip()Word distribution by source
freq <- tidy_repo %>%
count(source, word) %>%
group_by(source) %>%
mutate(proportion = n / sum(n)) %>%
spread(source, proportion) %>%
gather(source, proportion, `blogs`:`twitter`) %>%
arrange(desc(proportion), desc(n))
freq %>%
filter(proportion > 0.002) %>%
mutate(word = reorder(word, proportion)) %>%
ggplot(aes(word, proportion)) +
geom_col() +
xlab(NULL) +
coord_flip() +
facet_grid(~source, scales = "free")Word cloud
cover_90 %>%
with(wordcloud(word, n, max.words = 100,
colors = brewer.pal(6, 'Dark2'), random.order = FALSE))saveRDS(tidy_repo, "./data/final/tidy_repo.rds")
saveRDS(cover_90, "./data/final/cover_90.rds")
rm(tidy_repo, cover_50, cover_90)Create bigrams by source using unnest_tokens
bigram_repo <- clean_sample %>%
unnest_tokens(bigram, text, token = "ngrams", n = 2)Number of bigrams to attain 90% coverage of all bigrams in repo
bigram_cover_90 <- bigram_repo %>%
count(bigram) %>%
mutate(proportion = n / sum(n)) %>%
arrange(desc(proportion)) %>%
mutate(coverage = cumsum(proportion)) %>%
filter(coverage <= 0.9)
nrow(bigram_cover_90)## [1] 1244584
Bigram distribution
bigram_cover_90 %>%
top_n(20, proportion) %>%
mutate(bigram = reorder(bigram, proportion)) %>%
ggplot(aes(bigram, proportion)) +
geom_col() +
xlab(NULL) +
coord_flip()saveRDS(bigram_cover_90, "./data/final/cover_90.rds")Create Trigrams by source using unnest_tokens
trigram_repo <- clean_sample %>%
unnest_tokens(trigram, text, token = "ngrams", n = 3)Number of trigrams to attain 90% coverage of all trigrams in repo
trigram_cover_90 <- trigram_repo %>%
count(trigram) %>%
mutate(proportion = n / sum(n)) %>%
arrange(desc(proportion)) %>%
mutate(coverage = cumsum(proportion)) %>%
filter(coverage <= 0.9)
nrow(trigram_cover_90)## [1] 4446207
trigram distribution
trigram_cover_90 %>%
top_n(20, proportion) %>%
mutate(trigram = reorder(trigram, proportion)) %>%
ggplot(aes(trigram, proportion)) +
geom_col() +
xlab(NULL) +
coord_flip()saveRDS(trigram_cover_90, "./data/final/cover_90.rds")Create quadgrams by source using unnest_tokens
quadgram_repo <- clean_sample %>%
unnest_tokens(quadgram, text, token = "ngrams", n = 4)Number of quadgrams to attain 90% coverage of all quadgrams in repo
quadgram_cover_90 <- quadgram_repo %>%
count(quadgram) %>%
mutate(proportion = n / sum(n)) %>%
arrange(desc(proportion)) %>%
mutate(coverage = cumsum(proportion)) %>%
filter(coverage <= 0.9)
nrow(quadgram_cover_90)## [1] 6083973
quadgram distribution
quadgram_cover_90 %>%
top_n(20, proportion) %>%
mutate(quadgram = reorder(quadgram, proportion)) %>%
ggplot(aes(quadgram, proportion)) +
geom_col() +
xlab(NULL) +
coord_flip()quadgrams_separated <- quadgram_cover_90 %>%
separate(quadgram, c("word1", "word2", "word3", "word4"), sep = " ")
quadgrams_separated## # A tibble: 6,083,973 × 7
## word1 word2 word3 word4 n proportion coverage
## <chr> <chr> <chr> <chr> <int> <dbl> <dbl>
## 1 <NA> <NA> <NA> <NA> 43978 0.00578 0.00578
## 2 the end of the 740 0.0000973 0.00588
## 3 the rest of the 658 0.0000865 0.00597
## 4 for the first time 623 0.0000819 0.00605
## 5 at the end of 616 0.0000810 0.00613
## 6 at the same time 495 0.0000651 0.00619
## 7 is going to be 458 0.0000602 0.00625
## 8 is one of the 421 0.0000553 0.00631
## 9 one of the most 401 0.0000527 0.00636
## 10 when it comes to 388 0.0000510 0.00641
## # … with 6,083,963 more rows
saveRDS(quadgram_cover_90, "./data/final/cover_90.rds")
end <- Sys.time()
(run_time <- end - start_time)## Time difference of 18.07368 mins
sessionInfo() ## R version 4.2.0 (2022-04-22)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur/Monterey 10.16
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] ngram_3.2.1 wordcloud_2.6 RColorBrewer_1.1-3 knitr_1.39
## [5] forcats_0.5.1 stringr_1.4.0 dplyr_1.0.9 purrr_0.3.4
## [9] readr_2.1.2 tidyr_1.2.0 tibble_3.1.7 ggplot2_3.3.6
## [13] tidyverse_1.3.1 tidytext_0.3.3
##
## loaded via a namespace (and not attached):
## [1] Rcpp_1.0.8.3 lubridate_1.8.0 lattice_0.20-45 assertthat_0.2.1
## [5] digest_0.6.29 utf8_1.2.2 R6_2.5.1 cellranger_1.1.0
## [9] backports_1.4.1 reprex_2.0.1 evaluate_0.15 httr_1.4.3
## [13] highr_0.9 pillar_1.7.0 rlang_1.0.2 readxl_1.4.0
## [17] rstudioapi_0.13 Matrix_1.4-1 rmarkdown_2.14 labeling_0.4.2
## [21] bit_4.0.4 munsell_0.5.0 broom_0.8.0 compiler_4.2.0
## [25] janeaustenr_0.1.5 modelr_0.1.8 xfun_0.31 pkgconfig_2.0.3
## [29] htmltools_0.5.2 tidyselect_1.1.2 fansi_1.0.3 crayon_1.5.1
## [33] tzdb_0.3.0 dbplyr_2.2.0 withr_2.5.0 SnowballC_0.7.0
## [37] grid_4.2.0 jsonlite_1.8.0 gtable_0.3.0 lifecycle_1.0.1
## [41] DBI_1.1.2 magrittr_2.0.3 scales_1.2.0 tokenizers_0.2.1
## [45] vroom_1.5.7 cli_3.3.0 stringi_1.7.6 farver_2.1.0
## [49] fs_1.5.2 xml2_1.3.3 ellipsis_0.3.2 generics_0.1.2
## [53] vctrs_0.4.1 tools_4.2.0 bit64_4.0.5 glue_1.6.2
## [57] hms_1.1.1 parallel_4.2.0 fastmap_1.1.0 yaml_2.3.5
## [61] colorspace_2.0-3 rvest_1.0.2 haven_2.5.0