Task 1: Getting and Cleaning the Data

Turki 2022-06-17

Download and explore the data

Create a data directory

# Make sure the local "data" directory exists before anything is written to it.
# dir.exists() is used instead of file.exists() so that a regular file that
# happens to be called "data" is not mistaken for the directory.
if (!dir.exists("data")) {
  dir.create("data")
}

Download the data

# url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
# download(url, dest="dataset.zip", mode="wb") 
# unzip ("dataset.zip", exdir = "./data")

English Repository Files

# Relative paths to the three English (en_US) text files used below. They sit
# under ./data/final/en_US, the same folder the later steps write their
# outputs to.
blogs_file   <- "./data/final/en_US/en_US.blogs.txt"
news_file    <- "./data/final/en_US/en_US.news.txt"
twitter_file <- "./data/final/en_US/en_US.twitter.txt"  

File Sizes (MB, 1 MB = 2^20 bytes)

# Size of each source file on disk, converted from bytes to units of 2^20 bytes.
bytes_per_mb <- 2^20
blogs_size   <- file.size(blogs_file) / bytes_per_mb
news_size    <- file.size(news_file) / bytes_per_mb
twitter_size <- file.size(twitter_file) / bytes_per_mb

Read the data files

# Load each file as a character vector with one element per line of text.
# NOTE(review): read_lines() is presumably readr's reader; the library() call
# is not visible in this part of the file.
blogs   <- read_lines(file = blogs_file)
news    <- read_lines(file = news_file)
twitter <- read_lines(file = twitter_file)
## Warning: One or more parsing issues, see `problems()` for details

Number of Lines per file

# Number of lines read from each file, plus the grand total across all three.
blogs_lines   <- length(blogs)
news_lines    <- length(news)
twitter_lines <- length(twitter)
total_lines   <- sum(blogs_lines, news_lines, twitter_lines)

Distribution of characters per line, by file

# Length of every line, in characters, kept as one integer vector per file.
blogs_nchar   <- nchar(blogs, type = "chars")
news_nchar    <- nchar(news, type = "chars")
twitter_nchar <- nchar(twitter, type = "chars")

# Side-by-side box plots of line length for the three files.
# log = "y" draws the y axis on a logarithmic scale; the plotted values are
# still raw character counts, so the axis label says "log scale" rather than
# claiming the data were log-transformed. The title is passed through `main`
# instead of a separate title() call.
boxplot(blogs_nchar, news_nchar, twitter_nchar,
        log = "y",
        names = c("blogs", "news", "twitter"),
        main = "Comparing Distributions of Characters per Line",
        xlab = "File Name",
        ylab = "Number of Characters (log scale)")

Total characters per file

# Total number of characters in each file (sum of the per-line lengths).
blogs_nchar_sum   <- sum(blogs_nchar, na.rm = FALSE)
news_nchar_sum    <- sum(news_nchar, na.rm = FALSE)
twitter_nchar_sum <- sum(twitter_nchar, na.rm = FALSE)

Total words per file

# Total number of space-separated words in each file.
# The twitter count must be computed from `twitter`; it was previously computed
# from `news`, which made the twitter and news word counts identical in the
# summary table.
# NOTE(review): wordcount() comes from a package loaded outside this view.
blogs_words   <- wordcount(blogs,   sep = " ")
news_words    <- wordcount(news,    sep = " ")
twitter_words <- wordcount(twitter, sep = " ")

Create summary of repo stats

# One row per file: size on disk, line count, character count and word count,
# followed by each file's share of the total characters, lines and words
# (proportions rounded to two decimals). The table is then rendered with
# kable().
repo_summary <- data.frame(
  f_names = c("blogs", "news", "twitter"),
  f_size  = c(blogs_size, news_size, twitter_size),
  f_lines = c(blogs_lines, news_lines, twitter_lines),
  n_char  = c(blogs_nchar_sum, news_nchar_sum, twitter_nchar_sum),
  n_words = c(blogs_words, news_words, twitter_words)
) %>%
  mutate(
    pct_n_char = round(n_char / sum(n_char), 2),
    pct_lines  = round(f_lines / sum(f_lines), 2),
    pct_words  = round(n_words / sum(n_words), 2)
  )
kable(repo_summary)
f_names f_size f_lines n_char n_words pct_n_char pct_lines pct_words
blogs 200.4242 899288 206824505 37334131 0.36 0.21 0.35
news 196.2775 1010242 203223159 34372530 0.36 0.24 0.32
twitter 159.3641 2360148 162096031 34372530 0.28 0.55 0.32

Sample the data and save the sample

Compute sample sizes in terms of lines

# Fraction of lines to draw from each file, and a fixed seed so the sample is
# reproducible.
sample_pct <- 0.05
set.seed(1001)

# Number of lines to sample per file. The product is generally fractional, so
# it is floored to a whole number of lines; the floored sizes add up to the
# 213483 documents recorded for the resulting corpus further down.
# NOTE: from here on the *_size variables hold sample line counts, no longer
# the on-disk file sizes computed earlier.
blogs_size   <- floor(blogs_lines * sample_pct)
news_size    <- floor(news_lines * sample_pct)
twitter_size <- floor(twitter_lines * sample_pct)

Create samples

# Draw the random subsets without replacement, in a fixed order (blogs, news,
# twitter) so the seeded random stream is consumed the same way every run,
# then stack them into a single character vector.
blogs_sample   <- sample(x = blogs, size = blogs_size)
news_sample    <- sample(x = news, size = news_size)
twitter_sample <- sample(x = twitter, size = twitter_size)
repo_sample    <- c(blogs_sample, news_sample, twitter_sample)

Save sample

# Persist the combined sample twice: as plain text (one line per element) and
# as an RDS file that can be reloaded with readRDS().
writeLines(text = repo_sample,
           con = "./data/final/en_US/en_US.repo_sample.txt")
saveRDS(object = repo_sample,
        file = "./data/final/en_US/repo_sample.rds")

Clean the sample data

Use tm to create and clean the corpus

# Wrap the sampled lines in a tm corpus (one document per line) and print the
# first document so the effect of the later cleaning steps can be compared
# against it.
sample_source <- VectorSource(repo_sample)
clean_sample  <- Corpus(sample_source)
print(as.character(clean_sample[[1]]))
## [1] "***As many of you have pointed out, you don't need to spend $20 on ONE diaper. You can buy a dozen prefolds and 3 covers for under $50...and that would be enough diapers to get you through an entire day."

Remove URLs
Source: R and Data Mining

# Delete every run of characters that starts with "http" and continues up to
# the next whitespace character (or the end of the string).
# x: character vector; returns a character vector of the same length.
removeURL <- function(x) {
  gsub(pattern = "http[^[:space:]]*", replacement = "", x = x)
}
# Apply removeURL to the content of every document in the corpus.
# content_transformer() wraps the plain character function for use by tm_map().
clean_sample <- tm_map(clean_sample, content_transformer(removeURL))
## Warning in tm_map.SimpleCorpus(clean_sample, content_transformer(removeURL)):
## transformation drops documents
# Delete every character that is neither alphabetic ([:alpha:]) nor whitespace
# ([:space:]): digits, punctuation and symbols are dropped without inserting a
# separator, so "don't" becomes "dont".
# x: character vector; returns a character vector of the same length.
removeNumPunct <- function(x) {
  gsub(pattern = "[^[:alpha:][:space:]]*", replacement = "", x = x)
}
# Apply removeNumPunct to every document, leaving only letters and whitespace.
clean_sample <- tm_map(clean_sample, content_transformer(removeNumPunct))
## Warning in tm_map.SimpleCorpus(clean_sample,
## content_transformer(removeNumPunct)): transformation drops documents

Transform sample to all lower case

# Lower-case every document. This runs before the word-removal steps below, so
# the text those steps see is already lower case.
clean_sample <- tm_map(clean_sample, content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(clean_sample, content_transformer(tolower)):
## transformation drops documents

Create profanity filter
Source: List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words

# Load the profanity list, one entry per line, into a one-column data frame
# (the removal step reads column 1).
# Quoting and comment handling are switched off so that an entry containing
# ', " or # is read literally instead of being merged with neighbouring lines
# or truncated; stringsAsFactors = FALSE keeps the column as character on
# every R version.
profanity <- read.table("./data/final/en_US/profanity.txt",
                        header = FALSE, sep = "\n",
                        quote = "", comment.char = "",
                        stringsAsFactors = FALSE)

Remove profanity

# Remove every entry of the profanity list (first column of `profanity`) from
# each document.
clean_sample <- tm_map(clean_sample, removeWords, profanity[,1])
## Warning in tm_map.SimpleCorpus(clean_sample, removeWords, profanity[, 1]):
## transformation drops documents

Remove stopwords

# Remove stop words in two passes: first tm's "english" list, then its "SMART"
# list, and print the first document to show the effect.
# NOTE(review): apostrophes were already stripped by removeNumPunct, so list
# entries spelled with an apostrophe (e.g. "don't") presumably no longer match;
# the printed sample below still contains "dont". Confirm whether stop words
# should be removed before punctuation.
clean_sample <- tm_map(clean_sample, removeWords, stopwords("english"))
## Warning in tm_map.SimpleCorpus(clean_sample, removeWords, stopwords("english")):
## transformation drops documents
clean_sample <- tm_map(clean_sample, removeWords, stopwords("SMART"))
## Warning in tm_map.SimpleCorpus(clean_sample, removeWords, stopwords("SMART")):
## transformation drops documents
print(as.character(clean_sample[[1]]))
## [1] "     pointed   dont   spend    diaper   buy  dozen prefolds   covers        diapers      entire day"

Collapse repeated whitespace

# Collapse the runs of blanks left behind by the removals above into single
# spaces, then print the first document again. A single leading space can
# remain, as the printed result shows.
clean_sample <- tm_map(clean_sample, stripWhitespace)
## Warning in tm_map.SimpleCorpus(clean_sample, stripWhitespace): transformation
## drops documents
print(as.character(clean_sample[[1]]))
## [1] " pointed dont spend diaper buy dozen prefolds covers diapers entire day"

Save clean corpus

# Store the cleaned corpus next to the raw sample so later steps can reload it
# with readRDS() instead of repeating the cleaning.
saveRDS(object = clean_sample,
        file = "./data/final/en_US/clean_sample.rds")

Initial Exploratory Data Analysis

Convert to document term matrix

# Build the document-term matrix (documents in rows, terms in columns) and
# show its dimensions.
docterm_corpus <- DocumentTermMatrix(x = clean_sample)
dim(docterm_corpus)
## [1] 213483 141824
# Drop the sparsest terms using a sparsity threshold of 0.993, then show the
# reduced dimensions.
new_docterm_corpus <- removeSparseTerms(x = docterm_corpus, sparse = 0.993)
dim(new_docterm_corpus)
## [1] 213483    106

Find frequent terms

# Total occurrences of each retained term: convert the reduced matrix to a
# dense matrix and sum its columns (the result is a named numeric vector).
colS <- colSums(as.matrix(new_docterm_corpus))
length(colS)
## [1] 106
# Two-column table of terms and their counts, used for ranking and plotting.
doc_features <- data.table(name = names(colS), count = colS)

Most frequent and least frequent words

# Ten most frequent terms: sort by descending count and keep the first ten rows.
doc_features[order(-count)][seq_len(10L)]
##       name count
##  1:   time 10602
##  2:   good  8847
##  3:   dont  8733
##  4:    day  8307
##  5: people  7983
##  6:   love  7964
##  7:   back  6817
##  8:   make  6594
##  9:  great  6169
## 10:   year  5748
# Ten least frequent terms among those kept after the sparsity filter: sort by
# ascending count and keep the first ten rows.
doc_features[order(count)][seq_len(10L)]
##         name count
##  1:      ago  1563
##  2:      bit  1602
##  3:    check  1608
##  4:  started  1628
##  5:     room  1647
##  6:     stop  1664
##  7:   person  1666
##  8: business  1676
##  9:   friday  1688
## 10:   public  1691

Plot most frequent terms

# Bar chart of the terms that occur more than 5000 times.
# theme_economist() is a complete theme, so it has to be added BEFORE the
# theme() tweak: adding it afterwards replaces the rotated x-axis labels.
# geom_col() is the direct equivalent of geom_bar(stat = "identity").
ggplot(doc_features[count > 5000], aes(name, count)) +
  geom_col(fill = "lightblue", color = "black") +
  theme_economist() +
  # No colour aesthetic is mapped in aes(), so this scale currently has
  # nothing to act on; it is kept so the layer list matches the original.
  scale_color_economist() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Create word cloud

# Two word clouds built from the same term counts, differing only in the
# minimum frequency a term needs to be drawn (500, then 2000). Both use the
# same six-colour "Dark2" palette, with the most frequent terms placed first.
cloud_palette <- brewer.pal(6, "Dark2")

wordcloud(words = names(colS), freq = colS, min.freq = 500,
          colors = cloud_palette, random.order = FALSE)

wordcloud(words = names(colS), freq = colS, min.freq = 2000,
          colors = cloud_palette, random.order = FALSE)

# Report the total run time.
# NOTE(review): `start` is not defined in the visible part of this file; it is
# presumably captured with Sys.time() in a setup step that is not shown.
end <- Sys.time()
ellapsed <- end - start
ellapsed
## Time difference of 3.019779 mins