Data Science Capstone

The goal of this project is to demonstrate that you have become comfortable working with the data and that you are on track to create your prediction algorithm. Please submit a report on RPubs ( http://rpubs.com/ ) that explains your exploratory analysis and your goals for the eventual app and algorithm. This document should be concise, explaining only the major features of the data you have identified and briefly summarizing your plans for creating the prediction algorithm and Shiny app in a way that would be understandable to a non-data-scientist manager. You should make use of tables and plots to illustrate important summaries of the data set. The motivation for this project is to:

1. Demonstrate that you've downloaded the data and have successfully loaded it in.
2. Create a basic report of summary statistics about the data sets.
3. Report any interesting findings that you have amassed so far.
4. Get feedback on your plans for creating a prediction algorithm and Shiny app.

Introduction

We start by loading the necessary libraries. We will mainly use the tm and quanteda packages for the text mining functions.

# load necessary libraries
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.3.2
library(tm)
## Loading required package: NLP
## 
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
## 
##     annotate
library(wordcloud)
## Loading required package: RColorBrewer
library(quanteda)
## Warning: package 'quanteda' was built under R version 4.3.2
## Package version: 3.3.1
## Unicode version: 15.1
## ICU version: 74.1
## Parallel computing: 8 of 8 threads used.
## See https://quanteda.io for tutorials and examples.
## 
## Attaching package: 'quanteda'
## The following object is masked from 'package:tm':
## 
##     stopwords
## The following objects are masked from 'package:NLP':
## 
##     meta, meta<-
# Check the working directory
getwd()
## [1] "C:/Users/Humberto/Documents/Machine Learning"
# Extract data from TXT files
blogs <- readLines("Coursera Swift Key/en_US.blogs.txt")
news <- readLines("Coursera Swift Key/en_US.news.txt")
## Warning in readLines("Coursera Swift Key/en_US.news.txt"): incomplete final
## line found on 'Coursera Swift Key/en_US.news.txt'
twitter <- readLines("Coursera Swift Key/en_US.twitter.txt")
## Warning in readLines("Coursera Swift Key/en_US.twitter.txt"): line 167155
## appears to contain an embedded nul
## Warning in readLines("Coursera Swift Key/en_US.twitter.txt"): line 268547
## appears to contain an embedded nul
## Warning in readLines("Coursera Swift Key/en_US.twitter.txt"): line 1274086
## appears to contain an embedded nul
## Warning in readLines("Coursera Swift Key/en_US.twitter.txt"): line 1759032
## appears to contain an embedded nul
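
These warnings are harmless (all lines are still read), but the embedded nuls can be skipped at read time. A minimal alternative read, assuming UTF-8 encoding for these files (the encoding choice and the skipNul flag are our additions, not part of the original run):

# Defensive re-read (sketch): skipNul avoids the embedded-nul warnings on the Twitter
# file; the "incomplete final line" warning on the news file only means the file does
# not end with a newline and can be ignored.
blogs   <- readLines("Coursera Swift Key/en_US.blogs.txt",   encoding = "UTF-8", skipNul = TRUE)
news    <- readLines("Coursera Swift Key/en_US.news.txt",    encoding = "UTF-8", skipNul = TRUE)
twitter <- readLines("Coursera Swift Key/en_US.twitter.txt", encoding = "UTF-8", skipNul = TRUE)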

Functions

Here we define the helper functions used throughout the report: one to summarize each file (size, line count, longest line, and word count), and, further below, functions to clean the data (convert to lowercase, remove numbers, stop words, punctuation, and extra whitespace).

# Dataset Summaries

filestats <- function(text_file, lines){
  filesize <- file.info(text_file)$size / 1024^2 # File size in MB
  nchars <- nchar(lines) # Number of characters in each line
  maxchars <- max(nchars) # Length of the longest line
  wordcount <- sum(sapply(strsplit(lines, "\\s+"), length)) # Split lines on whitespace and count the words
  return(c(text_file, format(round(filesize, 2), nsmall = 2), length(lines), maxchars, wordcount))
}

blogstat <- filestats("Coursera Swift Key/en_US.blogs.txt", blogs)
newstat <- filestats("Coursera Swift Key/en_US.news.txt", news)
twitterstat <- filestats("Coursera Swift Key/en_US.twitter.txt", twitter)

testsumm <- c(blogstat, newstat, twitterstat)
df <- data.frame(matrix(unlist(testsumm), nrow = 3, byrow = T))
colnames(df) <- c("text_file", "size(Mb)", "line_count", "max line length", "words count")
print(df)
##                              text_file size(Mb) line_count max line length
## 1   Coursera Swift Key/en_US.blogs.txt   200.42     899288          483415
## 2    Coursera Swift Key/en_US.news.txt   196.28      77259           14556
## 3 Coursera Swift Key/en_US.twitter.txt   159.36    2360148         1105776
##   words count
## 1      899347
## 2       77260
## 3     2360169

Cleaning the data

# Cleaning the data

make_corpus <- function(text_file){
  gen_corp <- paste(text_file, collapse = " ") # Collapse all lines into a single document
  gen_corp <- VectorSource(gen_corp)
  gen_corp <- Corpus(gen_corp)
  return(gen_corp)
}

clean_corp <- function(corp_data){
  corp_data <- tm_map(corp_data, removeNumbers) # Remove Numbers
  corp_data <- tm_map(corp_data, content_transformer(tolower)) #All Chars to lower
  corp_data <- tm_map(corp_data, removeWords, stopwords("english")) # Remove common stop words
  corp_data <- tm_map(corp_data, removePunctuation) # Remove punctuation
  corp_data <- tm_map(corp_data, stripWhitespace) # Strip extra whitespace
  return (corp_data)
}
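
As a quick sanity check, the cleaning pipeline can be run on a couple of made-up sentences (the demo strings below are purely illustrative):

# Tiny illustrative example of make_corpus() + clean_corp() on made-up input
demo_text <- c("The 3 Quick Brown Foxes!!", "They jumped over the LAZY dog, in 2024.")
demo_corp <- clean_corp(make_corpus(demo_text))
as.character(demo_corp[[1]])
# expected, roughly: " quick brown foxes jumped lazy dog "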

High Frequency Words

# Function to calculate high frequency words

high_freq_words <-  function(corp_data){
  term_sparse <- DocumentTermMatrix(corp_data)
  # Convert the document-term matrix into a dense matrix
  term_matrix <- as.matrix(term_sparse)
  freq_words <- colSums(term_matrix)
  freq_words <- as.data.frame(sort(freq_words, decreasing = TRUE))
  freq_words$words <- rownames(freq_words)
  colnames(freq_words) <- c("frequency", "word")
  return(freq_words)
}
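
Converting the document-term matrix to a dense matrix is fine for the 10% samples used below, but it can become memory-hungry on the full corpora. A sparse variant is sketched here, assuming the slam package (which tm already uses internally) is installed:

# Sparse alternative (sketch): column sums computed directly on the simple_triplet_matrix
high_freq_words_sparse <- function(corp_data){
  term_sparse <- DocumentTermMatrix(corp_data)
  freq <- sort(slam::col_sums(term_sparse), decreasing = TRUE)
  data.frame(frequency = freq, word = names(freq))
}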

# Bar Chart of High Frequency words

news_text <- sample(news, round(0.1*length(news)), replace = F) # 10% random sample of the news lines
news_corp <- make_corpus(news_text)
news_corp <- clean_corp(news_corp)
## Warning in tm_map.SimpleCorpus(corp_data, removeNumbers): transformation drops
## documents
## Warning in tm_map.SimpleCorpus(corp_data, content_transformer(tolower)):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(corp_data, removeWords, stopwords("english")):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(corp_data, removePunctuation): transformation
## drops documents
## Warning in tm_map.SimpleCorpus(corp_data, stripWhitespace): transformation
## drops documents
news_high_freq_words <- high_freq_words(news_corp)
news_high_freq_words1 <- news_high_freq_words[1:15, ]

ggplot(data = news_high_freq_words1, aes(x = reorder(word, frequency), y = frequency)) + 
  geom_col(fill = "darkblue") + 
  labs(x = "Words", y = "Frequency", title = "Most frequent words in US news") + 
  coord_flip() +
  theme_bw()

# High frequency words in blogs
blogs_text <- sample(blogs, round(0.1*length(blogs)), replace = F)
blogs_corp <- make_corpus(blogs_text)
blogs_corp<- clean_corp(blogs_corp)
## Warning in tm_map.SimpleCorpus(corp_data, removeNumbers): transformation drops
## documents
## Warning in tm_map.SimpleCorpus(corp_data, content_transformer(tolower)):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(corp_data, removeWords, stopwords("english")):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(corp_data, removePunctuation): transformation
## drops documents
## Warning in tm_map.SimpleCorpus(corp_data, stripWhitespace): transformation
## drops documents
blogs_high_freq_words <- high_freq_words(blogs_corp)
blogs_high_freq_words1 <- blogs_high_freq_words[1:15, ] # Top 15 most frequent words

ggplot(data = blogs_high_freq_words1, aes(x = reorder(word, frequency), y = frequency)) + 
  geom_col(fill = "darkblue") + 
  labs(x = "Words", y = "Frequency", title = "Most frequent words in US blogs") + 
  coord_flip() +
  theme_bw()

# High frequency words in Twitter
twitter_text <- sample(twitter, round(0.1*length(twitter)), replace = F)
twitter_corp <- make_corpus(twitter_text)
twitter_corp <- clean_corp(twitter_corp)
## Warning in tm_map.SimpleCorpus(corp_data, removeNumbers): transformation drops
## documents
## Warning in tm_map.SimpleCorpus(corp_data, content_transformer(tolower)):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(corp_data, removeWords, stopwords("english")):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(corp_data, removePunctuation): transformation
## drops documents
## Warning in tm_map.SimpleCorpus(corp_data, stripWhitespace): transformation
## drops documents
twitter_high_freq_words <- high_freq_words(twitter_corp)
twitter_high_freq_words1 <- twitter_high_freq_words[1:15, ] # Top 15 most frequent words
  
ggplot(data = twitter_high_freq_words1, aes(x = reorder(word, frequency), y = frequency)) +
  geom_col(fill = "darkblue") + 
  labs(x = "Words", y = "Frequency", title = "Most frequent words in US Twitter") + 
  coord_flip() + 
  theme_bw()

Wordclouds

Here we show three wordclouds, one for each corpus, displaying its most frequent words.

## US news word cloud
wordcloud(news_high_freq_words$word[1:100], news_high_freq_words$frequency[1:100], 
          colors = brewer.pal(8, "Dark2"))

## US blogs word cloud
wordcloud(blogs_high_freq_words$word[1:100], blogs_high_freq_words$frequency[1:100], 
          colors = brewer.pal(8, "Dark2"))

## US twitter word cloud
wordcloud(twitter_high_freq_words$word[1:100], twitter_high_freq_words$frequency[1:100], 
          colors = brewer.pal(8, "Dark2"))

Word Analysis

In this part we take a further 10% sample of each of the text samples above and tokenize it with quanteda to build unigram, bigram, and trigram frequency tables.

# Word Analysis
  
news_text1 <- sample(news_text, round(0.1*length(news_text)), replace = F) # further 10% sample of the news sample
news_tokens <- tokens(news_text1, what = "word",
                      remove_numbers = TRUE,
                      remove_punct = TRUE,
                      remove_separators = TRUE,
                      remove_symbols = TRUE)

news_tokens <- tokens_tolower(news_tokens)
news_tokens <- tokens_select(news_tokens, stopwords(), selection = "remove")

news_unigram <- tokens_ngrams(news_tokens, n = 1) ## unigram
news_unigram.dfm <- dfm(news_unigram) # stop words and punctuation were already removed at the token stage

news_bigram <- tokens_ngrams(news_tokens, n = 2) ## bigram
news_bigram.dfm <- dfm(news_bigram)

news_trigram <- tokens_ngrams(news_tokens, n = 3) ## trigram
news_trigram.dfm <- dfm(news_trigram)
## 20 top unigram words of news
topfeatures(news_unigram.dfm, 20) 
##    said     one     can     new    like   first    also    last    just    year 
##     166      68      58      58      51      49      48      47      44      43 
##      --   years   state  school    city million      st    time  people    good 
##      43      38      37      37      36      36      35      34      32      30
## 20 top bigram words of news
topfeatures(news_bigram.dfm, n= 20) 
##        st_louis       last_year      new_jersey      h_jpmorgan     high_school 
##              17               9               8               7               7 
##     jpmorgan_us        new_york   san_francisco     los_angeles      mmf_morgan 
##               6               6               6               6               5 
## jpmorgan_liquid   liquid_assets      assets_mmf          mmf_cl       make_sure 
##               5               5               5               5               5 
##      little_bit       last_week   united_states       sales_tax       years_ago 
##               5               5               5               4               4
## 20 top trigram words of news
topfeatures(news_trigram.dfm, n= 20) 
## jpmorgan_liquid_assets      liquid_assets_mmf    mmf_morgan_jpmorgan 
##                      5                      5                      4 
##           r_h_jpmorgan     jpmorgan_prime_mmf          h_jpmorgan_us 
##                      4                      4                      4 
##          mmf_service_h     service_h_jpmorgan       jpmorgan_us_govt 
##                      3                      3                      3 
##            us_govt_mmf      jpmorgan_us_treas          us_treas_plus 
##                      3                      3                      3 
##         treas_plus_mmf     andy_griffith_show         four_years_ago 
##                      3                      2                      2 
##         years_ago_said morgan_jpmorgan_liquid               mmf_cl_c 
##                      2                      2                      2 
##          cl_c_jpmorgan              mmf_inv_r 
##                      2                      2
# --------------------------- Blogs ------------------------------------------

blogs_text1 <- sample(blogs_text, round(0.1*length(blogs_text)), replace = F)
blogs_tokens <- tokens(blogs_text1, what = "word", remove_numbers = TRUE, remove_punct = TRUE, remove_separators = TRUE, remove_symbols = TRUE)
blogs_tokens <- tokens_tolower(blogs_tokens)
blogs_tokens <- tokens_select(blogs_tokens, stopwords(), selection = "remove")

blogs_unigram <- tokens_ngrams(blogs_tokens, n = 1) ## unigram
blogs_unigram.dfm <- dfm(blogs_unigram)

blogs_bigram <- tokens_ngrams(blogs_tokens, n = 2) ## bigram
blogs_bigram.dfm <- dfm(blogs_bigram)

blogs_trigram <- tokens_ngrams(blogs_tokens, n = 3) ## trigram
blogs_trigram.dfm <- dfm(blogs_trigram)
## 20 top unigram words of blogs
topfeatures(blogs_unigram.dfm, 20) 
##    one    can   just   like   time    get people    now   know    new   also 
##   1157   1016   1016    963    875    695    618    612    576    555    551 
##   back   even   good     us    day  first   make    way   much 
##    533    532    520    517    504    498    498    490    475
## 20 top bigram words of blogs
topfeatures(blogs_bigram.dfm, n= 20) 
##     can_see   right_now  first_time even_though    new_york   last_week 
##          49          49          48          48          48          44 
##   last_year   years_ago   make_sure  don’t_know   every_day    year_old 
##          43          41          39          38          38          35 
## pretty_much many_people don’t_think   feel_like high_school     can_get 
##          31          31          31          31          30          28 
##   felt_like     go_back 
##          27          27
## 20 top trigram words of blogs
topfeatures(blogs_trigram.dfm, n= 20) 
##          new_york_city        first_time_ever            let_us_know 
##                      9                      8                      7 
##       couple_weeks_ago         new_york_times        don’t_get_wrong 
##                      6                      6                      6 
##            long_way_go         still_long_way          two_years_ago 
##                      6                      5                      5 
##           year_old_son           new_york_n.y           17th_day_2nd 
##                      4                      4                      4 
##          day_2nd_month         spent_lot_time    amazon_services_llc 
##                      4                      4                      4 
##    services_llc_amazon          llc_amazon_eu          level_mp_cost 
##                      4                      4                      4 
## williston_north_dakota   north_dakota_rentals 
##                      4                      4
# ------------------ Twitter -----------------------------------------

twitter_text1 <- sample(twitter_text, round(0.1*length(twitter_text)), replace = F)
twitter_tokens <- tokens(twitter_text1, what = "word", remove_numbers = TRUE, remove_punct = TRUE, remove_separators = TRUE, remove_symbols = TRUE)
twitter_tokens <- tokens_tolower(twitter_tokens)
twitter_tokens <- tokens_select(twitter_tokens, stopwords(), selection = "remove")

twitter_unigram <- tokens_ngrams(twitter_tokens, n = 1) ## unigram
twitter_unigram.dfm <- dfm(twitter_unigram)

twitter_bigram <- tokens_ngrams(twitter_tokens, n = 2) ## bigram
twitter_bigram.dfm <- dfm(twitter_bigram)

twitter_trigram <- tokens_ngrams(twitter_tokens, n = 3) ## trigram
twitter_trigram.dfm <- dfm(twitter_trigram)
## 20 top unigram words of twitter
topfeatures(twitter_unigram.dfm, 20) 
##   just   like    get   love   good    can    now thanks    day     rt      u 
##   1503   1213   1100   1034    971    929    897    896    886    858    856 
##    one   time   know  today  great    lol    new    see     go 
##    805    803    799    754    743    718    705    674    671
## 20 top bigram words of twitter
topfeatures(twitter_bigram.dfm, n= 20) 
##       right_now      last_night looking_forward   thanks_follow  happy_birthday 
##             171              95              93              89              82 
##    good_morning       good_luck      looks_like        just_got       feel_like 
##              75              67              64              63              60 
##         can_get     follow_back        let_know       next_week       make_sure 
##              59              55              54              49              47 
##   please_follow           hi_hi    social_media         join_us       great_day 
##              46              46              43              42              42
## 20 top trigram words of twitter
topfeatures(twitter_trigram.dfm, n= 20) 
##               hi_hi_hi     happy_mother's_day      happy_mothers_day 
##                     45                     24                     16 
##         happy_new_year            let_us_know          cinco_de_mayo 
##                     16                     12                     11 
##     thanks_follow_hope               ha_ha_ha    look_forward_seeing 
##                      9                      9                      8 
##            bad_bad_bad          just_got_back looking_forward_seeing 
##                      8                      7                      7 
##          just_got_home       dreams_come_true         just_make_sure 
##                      6                      6                      6 
##    hope_everyone_great         cake_cake_cake         hope_great_day 
##                      6                      6                      6 
##  good_morning_everyone          cant_wait_see 
##                      6                      6
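
Next Steps

These n-gram tables are the building blocks of the planned prediction algorithm: given the last one or two words typed, the app will suggest the most frequent continuation and back off to shorter n-grams when no match is found, and the resulting model will be wrapped in a Shiny app with a simple text input. The sketch below illustrates that lookup on the Twitter tables built above; the function names and the simple backoff strategy are illustrative assumptions, not the final implementation.

# Sketch only: frequency lookup with backoff over the sample dfms
ngram_freq <- function(dfm_obj) {
  freqs <- colSums(dfm_obj) # feature frequencies, computed on the sparse dfm
  data.frame(ngram = names(freqs), frequency = as.numeric(freqs), stringsAsFactors = FALSE)
}

tri_freq <- ngram_freq(twitter_trigram.dfm)
bi_freq  <- ngram_freq(twitter_bigram.dfm)

predict_next <- function(w1, w2) {
  # Try trigrams starting with "w1_w2_", then back off to bigrams starting with "w2_"
  hits <- tri_freq[grepl(paste0("^", w1, "_", w2, "_"), tri_freq$ngram), ]
  if (nrow(hits) == 0)
    hits <- bi_freq[grepl(paste0("^", w2, "_"), bi_freq$ngram), ]
  if (nrow(hits) == 0) return(NA_character_) # no match in this small sample
  best <- hits$ngram[which.max(hits$frequency)]
  tail(strsplit(best, "_")[[1]], 1) # last word of the best-matching n-gram
}

predict_next("happy", "new")  # should suggest "year" on this Twitter sample

The full algorithm will be trained on larger samples of all three sources, will need a strategy for unseen words, and will be profiled for speed and memory so that it responds quickly inside the Shiny app.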