Task 1: Getting and Cleaning the Data

Turki 2022-06-17

Download and explore the data
Sample the data and save the sample
Clean the sample data
Initial Exploratory Data Analysis

Download and explore the data

Create a data directory

# Create the local data directory on first run.
# dir.exists() is used rather than file.exists() so that a regular file that
# happens to be named "data" is not mistaken for the directory.
if (!dir.exists("data")) {
  dir.create("data")
}

Download the data

# url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
# download(url, dest="dataset.zip", mode="wb") 
# unzip ("dataset.zip", exdir = "./data")

English Repository Files

# Paths of the three English (en_US) text files under ./data.
en_us_dir    <- "./data/final/en_US"
blogs_file   <- file.path(en_us_dir, "en_US.blogs.txt")
news_file    <- file.path(en_us_dir, "en_US.news.txt")
twitter_file <- file.path(en_us_dir, "en_US.twitter.txt")

File Sizes (Mb)

# On-disk size of each file, converted from bytes to Mb (2^20 bytes each).
bytes_per_mb <- 2^20
blogs_size   <- file.size(blogs_file) / bytes_per_mb
news_size    <- file.size(news_file) / bytes_per_mb
twitter_size <- file.size(twitter_file) / bytes_per_mb

Read the data files

# Load the full contents of each file; the results are later measured with
# length() (lines) and nchar() (characters per line).
# NOTE(review): read_lines() is not defined in this section -- presumably
# readr::read_lines() attached in a setup chunk; confirm. One of these calls
# emits the "parsing issues" warning shown in the output below.
blogs   <- read_lines(blogs_file)
news    <- read_lines(news_file)
twitter <- read_lines(twitter_file)

## Warning: One or more parsing issues, see `problems()` for details

Number of Lines per file

# Number of lines read from each file, plus the combined total.
blogs_lines   <- length(blogs)
news_lines    <- length(news)
twitter_lines <- length(twitter)
total_lines   <- sum(blogs_lines, news_lines, twitter_lines)

Distribution of characters per line, by file

# Length of every line in characters (one value per line, per file).
blogs_nchar   <- nchar(blogs, type = "chars")
news_nchar    <- nchar(news, type = "chars")
twitter_nchar <- nchar(twitter, type = "chars")

# Side-by-side box plots of line length for the three files.
# log = "y" draws the y axis on a logarithmic scale, so the axis still shows
# raw character counts; the label says so rather than claiming logged values.
boxplot(blogs_nchar, news_nchar, twitter_nchar, log = "y",
        names = c("blogs", "news", "twitter"),
        ylab = "Number of Characters (log scale)", xlab = "File Name")
# title() is a separate top-level call that annotates the plot drawn above.
title("Comparing Distributions of Characters per Line")

Total characters per file

# Total character count of each file: the sum of its per-line lengths.
blogs_nchar_sum   <- sum(blogs_nchar, na.rm = FALSE)
news_nchar_sum    <- sum(news_nchar, na.rm = FALSE)
twitter_nchar_sum <- sum(twitter_nchar, na.rm = FALSE)

Total words per file

# Count space-separated words in each file.
# NOTE(review): wordcount() is not defined in this section -- presumably
# ngram::wordcount() attached in a setup chunk; confirm.
blogs_words   <- wordcount(blogs,   sep = " ")
news_words    <- wordcount(news,    sep = " ")
# Each count must come from its own vector: counting `news` here instead of
# `twitter` makes the twitter row of the summary repeat the news word count.
twitter_words <- wordcount(twitter, sep = " ")

Create summary of repo stats

# Assemble the per-file statistics, then append each file's share of the
# totals (rounded to two decimals) for characters, lines and words.
repo_summary <- data.frame(
  f_names = c("blogs", "news", "twitter"),
  f_size  = c(blogs_size, news_size, twitter_size),
  f_lines = c(blogs_lines, news_lines, twitter_lines),
  n_char  = c(blogs_nchar_sum, news_nchar_sum, twitter_nchar_sum),
  n_words = c(blogs_words, news_words, twitter_words)
) %>%
  mutate(
    pct_n_char = round(n_char / sum(n_char), 2),
    pct_lines  = round(f_lines / sum(f_lines), 2),
    pct_words  = round(n_words / sum(n_words), 2)
  )
kable(repo_summary)

f_names	f_size	f_lines	n_char	n_words	pct_n_char	pct_lines	pct_words
blogs	200.4242	899288	206824505	37334131	0.36	0.21	0.35
news	196.2775	1010242	203223159	34372530	0.36	0.24	0.32
twitter	159.3641	2360148	162096031	34372530	0.28	0.55	0.32

Sample the data and save the sample

Compute sample sizes in terms of lines

# Fraction of the lines of each file to keep in the sample.
sample_pct <- 0.05
# Fix the RNG seed so the sample() calls that follow are reproducible.
set.seed(1001)
# Number of lines to draw from each file.
# NOTE: these assignments reuse blogs_size / news_size / twitter_size, which
# held the file sizes in Mb earlier in the script; from here on they hold
# (possibly fractional) line counts.
blogs_size   <- blogs_lines * sample_pct
news_size    <- news_lines * sample_pct
twitter_size <- twitter_lines * sample_pct

Create samples

# Draw the random samples, then pool them into one character vector.
# The three calls consume the RNG stream in this order; keep the order fixed
# so a given seed reproduces the same sample.
blogs_sample   <- sample(x = blogs,   size = blogs_size)
news_sample    <- sample(x = news,    size = news_size)
twitter_sample <- sample(x = twitter, size = twitter_size)
repo_sample    <- c(blogs_sample, news_sample, twitter_sample)

Save sample

# Persist the pooled sample twice: as plain text and as an RDS object.
sample_dir <- "./data/final/en_US"
writeLines(repo_sample, con = file.path(sample_dir, "en_US.repo_sample.txt"))
saveRDS(repo_sample, file = file.path(sample_dir, "repo_sample.rds"))

Clean the sample data

Use tm to create and clean the corpus

# Wrap the sampled lines in a tm corpus and show the first document as text.
sample_source <- VectorSource(repo_sample)
clean_sample  <- Corpus(sample_source)
first_doc     <- as.character(clean_sample[[1]])
print(first_doc)

## [1] "***As many of you have pointed out, you don't need to spend $20 on ONE diaper. You can buy a dozen prefolds and 3 covers for under $50...and that would be enough diapers to get you through an entire day."

Remove URLs
Source: R and Data Mining

# Delete every run of non-whitespace characters that starts with "http".
removeURL <- function(x) {
  gsub(pattern = "http[^[:space:]]*", replacement = "", x = x)
}
# Run removeURL over the corpus; content_transformer() adapts the plain
# string function for use with tm_map().
clean_sample <- tm_map(clean_sample, content_transformer(removeURL))

## Warning in tm_map.SimpleCorpus(clean_sample, content_transformer(removeURL)):
## transformation drops documents

# Remove every character that is neither a letter nor whitespace (digits,
# punctuation, symbols). Note that [:alpha:] is locale-dependent and is not
# restricted to the English letters a-z.
# The "+" quantifier matches only non-empty runs; "*" gives the same result
# but also matches the empty string at every position.
removeNumPunct <- function(x) gsub("[^[:alpha:][:space:]]+", "", x)
# Run removeNumPunct over the corpus via content_transformer().
clean_sample <- tm_map(clean_sample, content_transformer(removeNumPunct))

## Warning in tm_map.SimpleCorpus(clean_sample,
## content_transformer(removeNumPunct)): transformation drops documents

Transform sample to all lower case

# Lower-case the corpus text by applying base tolower() via
# content_transformer().
clean_sample <- tm_map(clean_sample, content_transformer(tolower))

## Warning in tm_map.SimpleCorpus(clean_sample, content_transformer(tolower)):
## transformation drops documents

Create profanity filter
Source: List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words

# Load the profanity list, one entry per line, into a one-column data frame.
# quote = "" and comment.char = "" make read.table() take each line literally:
# with the defaults, an apostrophe or double quote in an entry would open a
# quoted field and a "#" would truncate the line.
profanity <- read.table("./data/final/en_US/profanity.txt", header = FALSE,
                        sep = "\n", quote = "", comment.char = "",
                        stringsAsFactors = FALSE)

Remove profanity

# Remove every entry in the first column of the profanity list from the corpus.
clean_sample <- tm_map(clean_sample, removeWords, profanity[,1])

## Warning in tm_map.SimpleCorpus(clean_sample, removeWords, profanity[, 1]):
## transformation drops documents

Remove stopwords

# Remove stop words using the "english" list.
# NOTE(review): punctuation was stripped before this step, and the document
# printed below still contains "dont" -- presumably because the stop-word
# lists spell contractions with an apostrophe and so no longer match.
# Confirm, and consider removing stop words before stripping punctuation.
clean_sample <- tm_map(clean_sample, removeWords, stopwords("english"))

## Warning in tm_map.SimpleCorpus(clean_sample, removeWords, stopwords("english")):
## transformation drops documents

# Second pass with the "SMART" stop-word list.
clean_sample <- tm_map(clean_sample, removeWords, stopwords("SMART"))

## Warning in tm_map.SimpleCorpus(clean_sample, removeWords, stopwords("SMART")):
## transformation drops documents

# Show the first document after stop-word removal.
print(as.character(clean_sample[[1]]))

## [1] "     pointed   dont   spend    diaper   buy  dozen prefolds   covers        diapers      entire day"

Remove Whitespace

# Collapse the runs of blanks left behind by the removals above; as the
# printed result shows, a single leading space can remain.
clean_sample <- tm_map(clean_sample, stripWhitespace)

## Warning in tm_map.SimpleCorpus(clean_sample, stripWhitespace): transformation
## drops documents

# Show the first document after whitespace clean-up.
print(as.character(clean_sample[[1]]))

## [1] " pointed dont spend diaper buy dozen prefolds covers diapers entire day"

Save clean corpus

# Persist the cleaned corpus so later steps can reload it with readRDS().
clean_corpus_path <- "./data/final/en_US/clean_sample.rds"
saveRDS(clean_sample, file = clean_corpus_path)

Initial Exploratory Data Analysis

Convert to document term matrix

# Build the document-term matrix for the cleaned corpus and report its
# dimensions (documents x terms).
docterm_corpus <- DocumentTermMatrix(x = clean_sample)
dim(docterm_corpus)

## [1] 213483 141824

# Keep only the less sparse terms (maximal allowed sparsity 0.993) and report
# the reduced dimensions.
new_docterm_corpus <- removeSparseTerms(x = docterm_corpus, sparse = 0.993)
dim(new_docterm_corpus)

## [1] 213483    106

Find frequent terms

# Column sums of the reduced matrix: the total count of each retained term.
colS <- colSums(as.matrix(new_docterm_corpus))
length(colS)

## [1] 106

# One row per term: its name and its total count.
doc_features <- data.table(name = names(colS), count = colS)

Most frequent and least frequent words

# head() returns at most 10 rows; indexing with [1:10] would pad the result
# with NA rows whenever fewer than 10 terms are available.
head(doc_features[order(-count)], 10) # 10 most frequent words

##       name count
##  1:   time 10602
##  2:   good  8847
##  3:   dont  8733
##  4:    day  8307
##  5: people  7983
##  6:   love  7964
##  7:   back  6817
##  8:   make  6594
##  9:  great  6169
## 10:   year  5748

head(doc_features[order(count)], 10) # 10 least frequent words

##         name count
##  1:      ago  1563
##  2:      bit  1602
##  3:    check  1608
##  4:  started  1628
##  5:     room  1647
##  6:     stop  1664
##  7:   person  1666
##  8: business  1676
##  9:   friday  1688
## 10:   public  1691

Plot most frequent terms

# Bar chart of the terms that occur more than 5000 times.
# theme_economist() is a complete theme and replaces any theme settings added
# before it, so the axis-label rotation must come after it to take effect.
ggplot(doc_features[count > 5000], aes(name, count)) +
  geom_bar(stat = "identity", fill = "lightblue", color = "black") +
  theme_economist() + scale_color_economist() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Create word cloud

# Two word clouds of the retained terms, sharing one palette: the first shows
# terms with count >= 500, the second only terms with count >= 2000.
cloud_palette <- brewer.pal(6, 'Dark2')

wordcloud(words = names(colS), freq = colS, min.freq = 500,
          colors = cloud_palette, random.order = FALSE)

wordcloud(words = names(colS), freq = colS, min.freq = 2000,
          colors = cloud_palette, random.order = FALSE)

# Report the total run time of the document.
# NOTE(review): `start` is not assigned anywhere in this section -- presumably
# set with Sys.time() in a setup chunk; confirm.
end <- Sys.time()
# The surrounding parentheses make the assignment print its value.
(ellapsed <- end - start)

## Time difference of 3.019779 mins