Introduction

The goal of this milestone report is to demonstrate familiarity with the data and to show that work is on track toward the prediction algorithm. The report is published on RPubs (http://rpubs.com/) and covers the exploratory analysis along with goals for the eventual app and algorithm.

Data Loading and Summarizing

English Repository Files

File Sizes (Megabytes)

blogs_size   <- file.size(blogs_file) / (2^20)
news_size    <- file.size(news_file) / (2^20)
twitter_size <- file.size(twitter_file) / (2^20)
paste(blogs_size, "MB")
## [1] "200.424207687378 MB"
paste(news_size, "MB")
## [1] "196.277512550354 MB"
paste(twitter_size, "MB")
## [1] "159.364068984985 MB"

Read the data files into character vectors

blogs   <- readLines(blogs_file, skipNul = TRUE)
news    <- readLines(news_file,  skipNul = TRUE)
twitter <- readLines(twitter_file, skipNul = TRUE)
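
If readLines() raises encoding warnings on these files, one option is to read through an explicit UTF-8 connection. read_utf8 below is a hypothetical wrapper sketched as an alternative, not part of the original run:

read_utf8 <- function(path) {
  # Open the file with a declared UTF-8 encoding, then read as before
  con <- file(path, open = "r", encoding = "UTF-8")
  on.exit(close(con))
  readLines(con, skipNul = TRUE)
}
# blogs <- read_utf8(blogs_file)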

Number of Lines per file

blogs_lines   <- length(blogs)
news_lines    <- length(news)
twitter_lines <- length(twitter)
total_lines   <- blogs_lines + news_lines + twitter_lines

Distribution of characters per line, by file

blogs_nchar   <- nchar(blogs)
news_nchar    <- nchar(news)
twitter_nchar <- nchar(twitter)

boxplot(blogs_nchar, news_nchar, twitter_nchar, log = "y",
        names = c("blogs", "news", "twitter"),
        ylab = "log(Number of Characters)", xlab = "File Name")
title("Comparing Distributions of Characters per Line")

Total characters per file

blogs_nchar_sum   <- sum(blogs_nchar)
news_nchar_sum    <- sum(news_nchar)
twitter_nchar_sum <- sum(twitter_nchar)

Total words per file

blogs_words   <- wordcount(blogs,   sep = " ")
news_words    <- wordcount(news,    sep = " ")
twitter_words <- wordcount(twitter, sep = " ")
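
As a sanity check on the wordcount() totals above (presumably from the ngram package), a base-R count of whitespace-separated tokens can be computed. count_words is a hypothetical helper, and its totals may differ slightly from wordcount() on lines with extra whitespace:

# Hypothetical cross-check: count whitespace-separated tokens with base R
count_words <- function(x) sum(lengths(strsplit(x, "\\s+")))
# count_words(blogs)   # compare against blogs_words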

Summary of repo stats

repo_summary <- data.frame(f_names = c("blogs", "news", "twitter"),
                           f_size  = c(blogs_size, news_size, twitter_size),
                           f_lines = c(blogs_lines, news_lines, twitter_lines),
                           n_char =  c(blogs_nchar_sum, news_nchar_sum, twitter_nchar_sum),
                           n_words = c(blogs_words, news_words, twitter_words))
repo_summary <- repo_summary %>% mutate(pct_n_char = round(n_char/sum(n_char), 2))
repo_summary <- repo_summary %>% mutate(pct_lines = round(f_lines/sum(f_lines), 2))
repo_summary <- repo_summary %>% mutate(pct_words = round(n_words/sum(n_words), 2))
kable(repo_summary)
f_names     f_size   f_lines      n_char    n_words   pct_n_char   pct_lines   pct_words
blogs     200.4242    899288   206824505   37334131         0.36        0.21        0.35
news      196.2775   1010242   203223159   34372530         0.36        0.24        0.32
twitter   159.3641   2360148   162096241   34372530         0.28        0.55        0.32

Data Cleaning and Sampling Steps

Calculate sample sizes in terms of lines

sample_pct <- 0.05
set.seed(1001)
blogs_sample_size   <- floor(blogs_lines * sample_pct)
news_sample_size    <- floor(news_lines * sample_pct)
twitter_sample_size <- floor(twitter_lines * sample_pct)

Create repo samples

blogs_sample   <- sample(blogs, blogs_sample_size)
news_sample    <- sample(news, news_sample_size)
twitter_sample <- sample(twitter, twitter_sample_size)
repo_sample    <- c(blogs_sample, news_sample, twitter_sample)
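
The three sampling calls above could also be collapsed into a small helper. sample_lines below is a hypothetical convenience function sketched here, not part of the original script:

# Hypothetical helper: sample a fixed fraction of any character vector
# sample_lines <- function(x, pct) sample(x, floor(length(x) * pct))
# repo_sample  <- c(sample_lines(blogs, sample_pct),
#                   sample_lines(news, sample_pct),
#                   sample_lines(twitter, sample_pct))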

Save sample

writeLines(repo_sample, "~/Desktop/Capstone/data/final/en_US/en_US.repo_sample.txt")
saveRDS(repo_sample, file = "~/Desktop/Capstone/data/final/en_US/repo_sample.rds" )

Clean the sample data

clean_sample <- Corpus(VectorSource(repo_sample))
print(as.character(clean_sample[[1]]))
## [1] "***As many of you have pointed out, you don't need to spend $20 on ONE diaper. You can buy a dozen prefolds and 3 covers for under $50...and that would be enough diapers to get you through an entire day."

Remove URLs

removeURL <- function(x) gsub("http[^[:space:]]*", "", x)
clean_sample <- tm_map(clean_sample, content_transformer(removeURL))
## Warning in tm_map.SimpleCorpus(clean_sample, content_transformer(removeURL)):
## transformation drops documents
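
The "transformation drops documents" warning, repeated after each tm_map() call below, comes from operating on a SimpleCorpus and is generally benign. If it is a concern, building a VCorpus instead avoids it (an alternative sketch, not the approach used here):

# Alternative: tm_map() on a VCorpus does not emit the SimpleCorpus warning
# clean_sample <- VCorpus(VectorSource(repo_sample))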

Remove anything other than English letters or spaces

removeNumPunct <- function(x) gsub("[^[:alpha:][:space:]]*", "", x)
clean_sample <- tm_map(clean_sample, content_transformer(removeNumPunct))
## Warning in tm_map.SimpleCorpus(clean_sample,
## content_transformer(removeNumPunct)): transformation drops documents

Transform sample to all lower case

clean_sample <- tm_map(clean_sample, content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(clean_sample, content_transformer(tolower)):
## transformation drops documents

Create a profanity filter (bad-words list)

Source: Kaggle Bad-Bad-Words Dataset

profanity <- read.table("~/Desktop/Capstone/data/final/en_US/stopWords/bad-words.csv", header = FALSE, sep ="\n")
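
Because the list is one term per line, it could equally be read with readLines(); a hedged alternative to the read.table() call above:

# Alternative: read the one-word-per-line list directly into a character vector
# profanity_words <- readLines("~/Desktop/Capstone/data/final/en_US/stopWords/bad-words.csv")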

Remove profanity

clean_sample <- tm_map(clean_sample, removeWords, profanity[,1])
## Warning in tm_map.SimpleCorpus(clean_sample, removeWords, profanity[, 1]):
## transformation drops documents

Remove stopwords

clean_sample <- tm_map(clean_sample, removeWords, stopwords("english"))
## Warning in tm_map.SimpleCorpus(clean_sample, removeWords, stopwords("english")):
## transformation drops documents
clean_sample <- tm_map(clean_sample, removeWords, stopwords("SMART"))
## Warning in tm_map.SimpleCorpus(clean_sample, removeWords, stopwords("SMART")):
## transformation drops documents
print(as.character(clean_sample[[1]]))
## [1] "     pointed   dont   spend    diaper   buy  dozen prefolds   covers        diapers      entire day"

Remove Whitespace

clean_sample <- tm_map(clean_sample, stripWhitespace)
## Warning in tm_map.SimpleCorpus(clean_sample, stripWhitespace): transformation
## drops documents
print(as.character(clean_sample[[1]]))
## [1] " pointed dont spend diaper buy dozen prefolds covers diapers entire day"

Save clean corpus

saveRDS(clean_sample, file = "~/Desktop/Capstone/data/final/en_US/clean_sample.rds" )

Initial Exploratory Data Analysis

Convert to document term matrix

docterm_corpus <- DocumentTermMatrix(clean_sample)
dim(docterm_corpus)
## [1] 213483 141522
new_docterm_corpus <- removeSparseTerms(docterm_corpus,sparse = 0.993)
dim(new_docterm_corpus)
## [1] 213483    106
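
The sparse = 0.993 threshold keeps only terms whose document frequency is at least 1 - 0.993 of the corpus. A quick illustrative check of what that means for this sample (not part of the original analysis):

# A retained term must appear in at least (1 - 0.993) of the 213,483 documents
ceiling((1 - 0.993) * dim(docterm_corpus)[1])   # roughly 1,495 documents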

Find frequent terms

colS <- colSums(as.matrix(new_docterm_corpus))
length(colS)
## [1] 106
doc_features <- data.table(name = attributes(colS)$names, count = colS)

Most frequent words

doc_features[order(-count)][1:10] #top 10 most frequent words
##       name count
##  1:   time 10602
##  2:   good  8847
##  3:   dont  8733
##  4:    day  8307
##  5: people  7983
##  6:   love  7964
##  7:   back  6817
##  8:   make  6597
##  9:  great  6169
## 10:   year  5748

Least frequent words

doc_features[order(count)][1:10] # 10 least frequent words
##         name count
##  1:      ago  1563
##  2:      bit  1602
##  3:    check  1608
##  4:  started  1628
##  5:     room  1647
##  6:     stop  1664
##  7:   person  1666
##  8: business  1676
##  9:   friday  1688
## 10:   public  1691

Plot of most frequent terms

ggplot(doc_features[count > 5000], aes(name, count)) +
  geom_bar(stat = "identity", fill = 'lightblue', color = 'black') +
  theme_economist() + scale_color_economist() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
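
Since aes(name, count) orders the bars alphabetically, reordering by frequency is one possible tweak to make the chart easier to scan (a variant sketch, not the plot rendered above):

ggplot(doc_features[count > 5000], aes(reorder(name, -count), count)) +
  geom_bar(stat = "identity", fill = 'lightblue', color = 'black') +
  theme_economist() + scale_color_economist() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  xlab("name")   # restore a readable axis label after reorder()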

Create word clouds

Word cloud with a minimum frequency of 500

wordcloud(names(colS), colS, min.freq = 500, 
          colors = brewer.pal(6, 'Dark2'), random.order = FALSE)  

Word cloud with a minimum frequency of 2000

wordcloud(names(colS), colS, min.freq = 2000, 
          colors = brewer.pal(6, 'Dark2'), random.order = FALSE)
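
wordcloud() places terms partly at random, so calling set.seed() immediately before each cloud keeps the layout stable across knits (a small reproducibility tweak, not in the original code):

# set.seed(1001)   # assumed seed value; any fixed seed gives a repeatable layout
# wordcloud(names(colS), colS, min.freq = 2000,
#           colors = brewer.pal(6, 'Dark2'), random.order = FALSE)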