Turki 2022-06-17
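The chunks below assume a setup chunk (not shown in this section) that loads the required packages and starts the timer used at the end of the report. A minimal sketch of what that chunk would contain, inferred from the functions used later:
# Packages assumed to be loaded earlier:
library(readr)        # read_lines()
library(ngram)        # wordcount() (assumed source of this function)
library(dplyr)        # %>% and mutate()
library(knitr)        # kable()
library(tm)           # Corpus(), tm_map(), DocumentTermMatrix()
library(data.table)   # data.table()
library(ggplot2)
library(ggthemes)     # theme_economist(), scale_color_economist()
library(wordcloud)
library(RColorBrewer) # brewer.pal()

# Start the timer referenced by `end - start` at the bottom of the report
start <- Sys.time()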
Create a data directory
if (!file.exists("data")) {
dir.create("data")
}
Download the data
# url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
# download(url, dest="dataset.zip", mode="wb")
# unzip ("dataset.zip", exdir = "./data")
English Repository Files
blogs_file <- "./data/final/en_US/en_US.blogs.txt"
news_file <- "./data/final/en_US/en_US.news.txt"
twitter_file <- "./data/final/en_US/en_US.twitter.txt"
File Sizes (MB)
blogs_size <- file.size(blogs_file) / (2^20)
news_size <- file.size(news_file) / (2^20)
twitter_size <- file.size(twitter_file) / (2^20)
Read the data files
blogs <- read_lines(blogs_file)
news <- read_lines(news_file)
twitter <- read_lines(twitter_file)
## Warning: One or more parsing issues, see `problems()` for details
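readr flags parsing issues in the Twitter file (typically embedded nul or non-UTF-8 characters); the affected rows can be inspected before deciding whether they matter. A quick check, assuming readr 2.x is the reader in use:
# Returns a tibble with the row, column, expected and actual value for each issue
problems(twitter)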
Number of Lines per file
blogs_lines <- length(blogs)
news_lines <- length(news)
twitter_lines <- length(twitter)
total_lines <- blogs_lines + news_lines + twitter_lines
Distribution of characters per line, by file
blogs_nchar <- nchar(blogs)
news_nchar <- nchar(news)
twitter_nchar <- nchar(twitter)
boxplot(blogs_nchar, news_nchar, twitter_nchar, log = "y",
names = c("blogs", "news", "twitter"),
ylab = "log(Number of Characters)", xlab = "File Name")
title("Comparing Distributions of Characters per Line")
Total characters per file
blogs_nchar_sum <- sum(blogs_nchar)
news_nchar_sum <- sum(news_nchar)
twitter_nchar_sum <- sum(twitter_nchar)
Total words per file
blogs_words <- wordcount(blogs, sep = " ")
news_words <- wordcount(news, sep = " ")
twitter_words <- wordcount(twitter, sep = " ")
Create summary of repo stats
repo_summary <- data.frame(f_names = c("blogs", "news", "twitter"),
f_size = c(blogs_size, news_size, twitter_size),
f_lines = c(blogs_lines, news_lines, twitter_lines),
n_char = c(blogs_nchar_sum, news_nchar_sum, twitter_nchar_sum),
n_words = c(blogs_words, news_words, twitter_words))
repo_summary <- repo_summary %>% mutate(pct_n_char = round(n_char/sum(n_char), 2))
repo_summary <- repo_summary %>% mutate(pct_lines = round(f_lines/sum(f_lines), 2))
repo_summary <- repo_summary %>% mutate(pct_words = round(n_words/sum(n_words), 2))
kable(repo_summary)
| f_names | f_size   | f_lines | n_char    | n_words  | pct_n_char | pct_lines | pct_words |
|---------|----------|---------|-----------|----------|------------|-----------|-----------|
| blogs   | 200.4242 | 899288  | 206824505 | 37334131 | 0.36       | 0.21      | 0.35      |
| news    | 196.2775 | 1010242 | 203223159 | 34372530 | 0.36       | 0.24      | 0.32      |
| twitter | 159.3641 | 2360148 | 162096031 | 34372530 | 0.28       | 0.55      | 0.32      |
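The table shows the corpus is heavily skewed toward Twitter in line count (55% of lines) but not in words or characters, which motivates sampling by lines below. A quick derived view of words per line, computed from columns already in repo_summary (not part of the original summary), makes this explicit:
# Average words per line for each file
repo_summary %>% mutate(words_per_line = round(n_words / f_lines, 1))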
Compute sample sizes in terms of lines
sample_pct <- 0.05
set.seed(1001)
blogs_size <- blogs_lines * sample_pct
news_size <- news_lines * sample_pct
twitter_size <- twitter_lines * sample_pct
Create samples
blogs_sample <- sample(blogs, blogs_size)
news_sample <- sample(news, news_size)
twitter_sample <- sample(twitter, twitter_size)
repo_sample <- c(blogs_sample, news_sample, twitter_sample)
Save sample
writeLines(repo_sample, "./data/final/en_US/en_US.repo_sample.txt")
saveRDS(repo_sample, file = "./data/final/en_US/repo_sample.rds")
Use tm to create and clean the corpus
clean_sample <- Corpus(VectorSource(repo_sample))
print(as.character(clean_sample[[1]]))## [1] "***As many of you have pointed out, you don't need to spend $20 on ONE diaper. You can buy a dozen prefolds and 3 covers for under $50...and that would be enough diapers to get you through an entire day."
Remove URLs
Source: R and Data Mining
removeURL <- function(x) gsub("http[^[:space:]]*", "", x)
clean_sample <- tm_map(clean_sample, content_transformer(removeURL))
## Warning in tm_map.SimpleCorpus(clean_sample, content_transformer(removeURL)):
## transformation drops documents
# Remove anything other than English letters or space
removeNumPunct <- function(x) gsub("[^[:alpha:][:space:]]*", "", x)
clean_sample <- tm_map(clean_sample, content_transformer(removeNumPunct))
## Warning in tm_map.SimpleCorpus(clean_sample,
## content_transformer(removeNumPunct)): transformation drops documents
Transform sample to all lower case
clean_sample <- tm_map(clean_sample, content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(clean_sample, content_transformer(tolower)):
## transformation drops documents
Create profanity filter
Source: List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words
profanity <- read.table("./data/final/en_US/profanity.txt", header = FALSE, sep = "\n")
Remove profanity
clean_sample <- tm_map(clean_sample, removeWords, profanity[,1])
## Warning in tm_map.SimpleCorpus(clean_sample, removeWords, profanity[, 1]):
## transformation drops documents
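Note that read.table() with sep = "\n" treats each line as one field but can stumble on quote or comment characters in the word list; readLines() is a simpler way to pull the list in. A minimal alternative sketch, assuming the same file path and one word per line:
# Read the profanity list with no quoting or comment parsing applied
profanity_words <- readLines("./data/final/en_US/profanity.txt", encoding = "UTF-8")
# removeWords would then take the character vector directly:
# clean_sample <- tm_map(clean_sample, removeWords, profanity_words)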
Remove stopwords
clean_sample <- tm_map(clean_sample, removeWords, stopwords("english"))## Warning in tm_map.SimpleCorpus(clean_sample, removeWords, stopwords("english")):
## transformation drops documents
clean_sample <- tm_map(clean_sample, removeWords, stopwords("SMART"))## Warning in tm_map.SimpleCorpus(clean_sample, removeWords, stopwords("SMART")):
## transformation drops documents
print(as.character(clean_sample[[1]]))## [1] " pointed dont spend diaper buy dozen prefolds covers diapers entire day"
Remove Whitespace
clean_sample <- tm_map(clean_sample, stripWhitespace)
## Warning in tm_map.SimpleCorpus(clean_sample, stripWhitespace): transformation
## drops documents
print(as.character(clean_sample[[1]]))## [1] " pointed dont spend diaper buy dozen prefolds covers diapers entire day"
Save clean corpus
saveRDS(clean_sample, file = "./data/final/en_US/clean_sample.rds")
Convert to document term matrix
docterm_corpus <- DocumentTermMatrix(clean_sample)
dim(docterm_corpus)
## [1] 213483 141824
new_docterm_corpus <- removeSparseTerms(docterm_corpus,sparse = 0.993)
dim(new_docterm_corpus)
## [1] 213483    106
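removeSparseTerms(docterm_corpus, sparse = 0.993) keeps only terms that appear in more than about 0.7% of the sampled documents, which is why the vocabulary collapses from 141,824 terms to 106. If the goal is simply to inspect common words without shrinking the matrix, tm's findFreqTerms() is a lighter-weight check; a small sketch, with the frequency cutoff chosen arbitrarily here:
# List terms occurring at least 1000 times across the corpus,
# without altering docterm_corpus itself
frequent_terms <- findFreqTerms(docterm_corpus, lowfreq = 1000)
head(frequent_terms, 20)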
Find frequent terms
colS <- colSums(as.matrix(new_docterm_corpus))
length(colS)
## [1] 106
doc_features <- data.table(name = attributes(colS)$names, count = colS)
Most frequent and least frequent words
doc_features[order(-count)][1:10] # top 10 most frequent words
## name count
## 1: time 10602
## 2: good 8847
## 3: dont 8733
## 4: day 8307
## 5: people 7983
## 6: love 7964
## 7: back 6817
## 8: make 6594
## 9: great 6169
## 10: year 5748
doc_features[order(count)][1:10] # 10 least frequent words
## name count
## 1: ago 1563
## 2: bit 1602
## 3: check 1608
## 4: started 1628
## 5: room 1647
## 6: stop 1664
## 7: person 1666
## 8: business 1676
## 9: friday 1688
## 10: public 1691
Plot most frequent terms
ggplot(doc_features[count > 5000], aes(name, count)) +
  geom_bar(stat = "identity", fill = 'lightblue', color = 'black') +
  theme_economist() + scale_color_economist() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
Create word cloud
wordcloud(names(colS), colS, min.freq = 500,
          colors = brewer.pal(6, 'Dark2'), random.order = FALSE)
wordcloud(names(colS), colS, min.freq = 2000,
          colors = brewer.pal(6, 'Dark2'), random.order = FALSE)
end <- Sys.time()
(elapsed <- end - start)
## Time difference of 3.019779 mins