The goal of this project is to demonstrate that we have become familiar with the data and are on track to create a prediction algorithm. This report, published on RPubs (http://rpubs.com/), explains the exploratory analysis and the goals for the eventual app and algorithm.
English Repository Files
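The chunks below assume the raw text files and the required packages are already available. A minimal setup sketch: the data directory is taken from the save paths used later in this report, and the en_US.* file names are assumptions based on the standard capstone dataset layout.

library(tm)           # Corpus, tm_map, stopwords, DocumentTermMatrix
library(ngram)        # wordcount
library(dplyr)        # %>%, mutate
library(data.table)   # data.table for term frequencies
library(knitr)        # kable
library(ggplot2)      # frequency bar chart
library(ggthemes)     # theme_economist, scale_color_economist
library(wordcloud)    # wordcloud
library(RColorBrewer) # brewer.pal

data_dir <- "~/Desktop/Capstone/data/final/en_US"
blogs_file   <- file.path(data_dir, "en_US.blogs.txt")    # assumed file name
news_file    <- file.path(data_dir, "en_US.news.txt")     # assumed file name
twitter_file <- file.path(data_dir, "en_US.twitter.txt")  # assumed file name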
# File sizes in megabytes (bytes / 2^20)
blogs_size <- file.size(blogs_file) / (2^20)
news_size <- file.size(news_file) / (2^20)
twitter_size <- file.size(twitter_file) / (2^20)
paste(blogs_size, "MB")
## [1] "200.424207687378 MB"
paste(news_size, "MB")
## [1] "196.277512550354 MB"
paste(twitter_size, "MB")
## [1] "159.364068984985 MB"
Read the data files into character vectors
blogs <- readLines(blogs_file, skipNul = TRUE)
news <- readLines(news_file, skipNul = TRUE)
twitter <- readLines(twitter_file, skipNul = TRUE)
blogs_lines <- length(blogs)
news_lines <- length(news)
twitter_lines <- length(twitter)
total_lines <- blogs_lines + news_lines + twitter_lines
blogs_nchar <- nchar(blogs)
news_nchar <- nchar(news)
twitter_nchar <- nchar(twitter)
boxplot(blogs_nchar, news_nchar, twitter_nchar, log = "y",
        names = c("blogs", "news", "twitter"),
        ylab = "Number of Characters (log scale)", xlab = "File Name")
title("Comparing Distributions of Characters per Line")
blogs_nchar_sum <- sum(blogs_nchar)
news_nchar_sum <- sum(news_nchar)
twitter_nchar_sum <- sum(twitter_nchar)
blogs_words <- wordcount(blogs, sep = " ")
news_words <- wordcount(news, sep = " ")
twitter_words <- wordcount(twitter, sep = " ")
repo_summary <- data.frame(f_names = c("blogs", "news", "twitter"),
f_size = c(blogs_size, news_size, twitter_size),
f_lines = c(blogs_lines, news_lines, twitter_lines),
n_char = c(blogs_nchar_sum, news_nchar_sum, twitter_nchar_sum),
n_words = c(blogs_words, news_words, twitter_words))
repo_summary <- repo_summary %>% mutate(pct_n_char = round(n_char/sum(n_char), 2))
repo_summary <- repo_summary %>% mutate(pct_lines = round(f_lines/sum(f_lines), 2))
repo_summary <- repo_summary %>% mutate(pct_words = round(n_words/sum(n_words), 2))
kable(repo_summary)
| f_names | f_size | f_lines | n_char | n_words | pct_n_char | pct_lines | pct_words |
|---|---|---|---|---|---|---|---|
| blogs | 200.4242 | 899288 | 206824505 | 37334131 | 0.36 | 0.21 | 0.35 |
| news | 196.2775 | 1010242 | 203223159 | 34372530 | 0.36 | 0.24 | 0.32 |
| twitter | 159.3641 | 2360148 | 162096241 | 34372530 | 0.28 | 0.55 | 0.32 |
Calculate sample sizes (5% of lines) and draw a random sample from each file
sample_pct <- 0.05
set.seed(1001)
blogs_sample_size <- round(blogs_lines * sample_pct)
news_sample_size <- round(news_lines * sample_pct)
twitter_sample_size <- round(twitter_lines * sample_pct)
blogs_sample <- sample(blogs, blogs_sample_size)
news_sample <- sample(news, news_sample_size)
twitter_sample <- sample(twitter, twitter_sample_size)
repo_sample <- c(blogs_sample, news_sample, twitter_sample)
writeLines(repo_sample, "~/Desktop/Capstone/data/final/en_US/en_US.repo_sample.txt")
saveRDS(repo_sample, file = "~/Desktop/Capstone/data/final/en_US/repo_sample.rds" )
clean_sample <- Corpus(VectorSource(repo_sample))
print(as.character(clean_sample[[1]]))
## [1] "***As many of you have pointed out, you don't need to spend $20 on ONE diaper. You can buy a dozen prefolds and 3 covers for under $50...and that would be enough diapers to get you through an entire day."
removeURL <- function(x) gsub("http[^[:space:]]*", "", x)
clean_sample <- tm_map(clean_sample, content_transformer(removeURL))
## Warning in tm_map.SimpleCorpus(clean_sample, content_transformer(removeURL)):
## transformation drops documents
removeNumPunct <- function(x) gsub("[^[:alpha:][:space:]]*", "", x)
clean_sample <- tm_map(clean_sample, content_transformer(removeNumPunct))
## Warning in tm_map.SimpleCorpus(clean_sample,
## content_transformer(removeNumPunct)): transformation drops documents
clean_sample <- tm_map(clean_sample, content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(clean_sample, content_transformer(tolower)):
## transformation drops documents
Remove profanity, using the Kaggle Bad-Bad-Words dataset as the word list
profanity <- read.table("~/Desktop/Capstone/data/final/en_US/stopWords/bad-words.csv", header = FALSE, sep ="\n")
clean_sample <- tm_map(clean_sample, removeWords, profanity[,1])
## Warning in tm_map.SimpleCorpus(clean_sample, removeWords, profanity[, 1]):
## transformation drops documents
clean_sample <- tm_map(clean_sample, removeWords, stopwords("english"))
## Warning in tm_map.SimpleCorpus(clean_sample, removeWords, stopwords("english")):
## transformation drops documents
clean_sample <- tm_map(clean_sample, removeWords, stopwords("SMART"))
## Warning in tm_map.SimpleCorpus(clean_sample, removeWords, stopwords("SMART")):
## transformation drops documents
print(as.character(clean_sample[[1]]))
## [1] " pointed dont spend diaper buy dozen prefolds covers diapers entire day"
clean_sample <- tm_map(clean_sample, stripWhitespace)
## Warning in tm_map.SimpleCorpus(clean_sample, stripWhitespace): transformation
## drops documents
print(as.character(clean_sample[[1]]))
## [1] " pointed dont spend diaper buy dozen prefolds covers diapers entire day"
saveRDS(clean_sample, file = "~/Desktop/Capstone/data/final/en_US/clean_sample.rds" )
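The cleaned corpus can be reloaded in a later session without repeating the sampling and cleaning steps:

clean_sample <- readRDS("~/Desktop/Capstone/data/final/en_US/clean_sample.rds")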
Convert the cleaned corpus to a document-term matrix
docterm_corpus <- DocumentTermMatrix(clean_sample)
dim(docterm_corpus)
## [1] 213483 141522
# Keep only terms appearing in more than 1 - 0.993 = 0.7% of documents
new_docterm_corpus <- removeSparseTerms(docterm_corpus, sparse = 0.993)
dim(new_docterm_corpus)
## [1] 213483 106
Find frequent terms
colS <- colSums(as.matrix(new_docterm_corpus))
length(colS)
## [1] 106
doc_features <- data.table(name = attributes(colS)$names, count = colS)
doc_features[order(-count)][1:10] #top 10 most frequent words
## name count
## 1: time 10602
## 2: good 8847
## 3: dont 8733
## 4: day 8307
## 5: people 7983
## 6: love 7964
## 7: back 6817
## 8: make 6597
## 9: great 6169
## 10: year 5748
doc_features[order(count)][1:10] # 10 least frequent words
## name count
## 1: ago 1563
## 2: bit 1602
## 3: check 1608
## 4: started 1628
## 5: room 1647
## 6: stop 1664
## 7: person 1666
## 8: business 1676
## 9: friday 1688
## 10: public 1691
ggplot(doc_features[count > 5000], aes(name, count)) +
  geom_bar(stat = "identity", fill = 'lightblue', color = 'black') +
  theme_economist() + scale_color_economist() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) # rotate labels after the complete theme so the setting is not overridden
Create word clouds with minimum term frequencies of 500 and 2000
wordcloud(names(colS), colS, min.freq = 500,
colors = brewer.pal(6, 'Dark2'), random.order = FALSE)
wordcloud(names(colS), colS, min.freq = 2000,
colors = brewer.pal(6, 'Dark2'), random.order = FALSE)
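As a next step toward the prediction algorithm, the cleaned sample can be tokenized into n-grams and their frequencies tabulated. A minimal sketch using the ngram package (already used above for wordcount); it assumes clean_sample is the SimpleCorpus built above, and the object names sample_text and bigrams are illustrative:

# Pull the cleaned text back out of the SimpleCorpus as a character vector
sample_text <- content(clean_sample)

# Collapse to a single string and tabulate bigram frequencies
bigrams <- ngram(paste(sample_text, collapse = " "), n = 2)
head(get.phrasetable(bigrams), 10) # top 10 bigrams with counts and proportions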