The goal of this project is simply to show that you have become comfortable working with the data and that you are on track to create your prediction algorithm. Please submit a report on R Pubs (http://rpubs.com/) that explains your exploratory analysis and your goals for the eventual app and algorithm. This document should be concise, explain only the major features of the data you have identified, and briefly summarize your plans for creating the prediction algorithm and Shiny app in a way that would be understandable to a non-data-scientist manager. You should make use of tables and plots to illustrate important summaries of the data set. The motivation for this project is to:
1. Demonstrate that you have downloaded the data and successfully loaded it in.
2. Create a basic report of summary statistics about the data sets.
3. Report any interesting findings amassed so far.
4. Get feedback on your plans for creating a prediction algorithm and Shiny app.
We start by loading the necessary libraries. We rely mainly on the tm and quanteda packages for the text-mining functions.
# load necessary libraries
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.3.2
library(tm)
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
library(wordcloud)
## Loading required package: RColorBrewer
library(quanteda)
## Warning: package 'quanteda' was built under R version 4.3.2
## Package version: 3.3.1
## Unicode version: 15.1
## ICU version: 74.1
## Parallel computing: 8 of 8 threads used.
## See https://quanteda.io for tutorials and examples.
##
## Attaching package: 'quanteda'
## The following object is masked from 'package:tm':
##
## stopwords
## The following objects are masked from 'package:NLP':
##
## meta, meta<-
# Check the current working directory
getwd()
## [1] "C:/Users/Humberto/Documents/Machine Learning"
# Extract data from TXT files
blogs <- readLines("Coursera Swift Key/en_US.blogs.txt")
news <- readLines("Coursera Swift Key/en_US.news.txt")
## Warning in readLines("Coursera Swift Key/en_US.news.txt"): incomplete final
## line found on 'Coursera Swift Key/en_US.news.txt'
twitter <- readLines("Coursera Swift Key/en_US.twitter.txt")
## Warning in readLines("Coursera Swift Key/en_US.twitter.txt"): line 167155
## appears to contain an embedded nul
## Warning in readLines("Coursera Swift Key/en_US.twitter.txt"): line 268547
## appears to contain an embedded nul
## Warning in readLines("Coursera Swift Key/en_US.twitter.txt"): line 1274086
## appears to contain an embedded nul
## Warning in readLines("Coursera Swift Key/en_US.twitter.txt"): line 1759032
## appears to contain an embedded nul
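The warnings above show that en_US.news.txt is missing a final newline and that en_US.twitter.txt contains embedded nul characters. If we want to silence them, readLines() can be asked to skip nuls and to treat the file as UTF-8 (assuming UTF-8 is the correct encoding for these files):
# optional: skip embedded nuls and read as UTF-8 to avoid the warnings above
twitter <- readLines("Coursera Swift Key/en_US.twitter.txt", encoding = "UTF-8", skipNul = TRUE)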
Here we define helper functions to summarize each file and to clean the text: converting everything to lowercase and removing numbers, punctuation, stop words and extra whitespace.
# Dataset Summaries
filestats <- function(text_file, lines){
  filesize <- file.info(text_file)$size / 1024^2             # file size in MB
  maxchars <- max(nchar(lines))                              # length of the longest line
  wordcount <- sum(sapply(strsplit(lines, "\\s+"), length))  # split each line on whitespace and count the words
  return(c(text_file, format(round(filesize, 2), nsmall = 2), length(lines), maxchars, wordcount))
}
blogstat <- filestats("Coursera Swift Key/en_US.blogs.txt", blogs)
newstat <- filestats("Coursera Swift Key/en_US.news.txt", news)
twitterstat <- filestats("Coursera Swift Key/en_US.twitter.txt", twitter)
testsumm <- c(blogstat, newstat, twitterstat)
df <- data.frame(matrix(unlist(testsumm), nrow = 3, byrow = T))
colnames(df) <- c("text_file", "size(Mb)", "line_count", "max line length", "words count")
print(df)
## text_file size(Mb) line_count max line length
## 1 Coursera Swift Key/en_US.blogs.txt 200.42 899288 483415
## 2 Coursera Swift Key/en_US.news.txt 196.28 77259 14556
## 3 Coursera Swift Key/en_US.twitter.txt 159.36 2360148 1105776
## words count
## 1 899347
## 2 77260
## 3 2360169
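Given the file sizes above, the exploratory analysis that follows works on a 10% random sample of each file. For reproducibility it may help to fix the seed before sampling; a minimal sketch (the helper name is illustrative, not part of the original code):
set.seed(1234)                              # any fixed seed makes the samples reproducible
sample_lines <- function(lines, p = 0.1) {  # hypothetical helper: keep a fraction p of the lines
  lines[sample(seq_along(lines), round(p * length(lines)))]
}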
# Cleaning the data
make_corpus <- function(text_file){
  gen_corp <- paste(text_file, collapse = " ")  # collapse all lines into a single document
  gen_corp <- VectorSource(gen_corp)
  gen_corp <- Corpus(gen_corp)
  return(gen_corp)
}
clean_corp <- function(corp_data){
  corp_data <- tm_map(corp_data, removeNumbers)                     # remove numbers
  corp_data <- tm_map(corp_data, content_transformer(tolower))      # convert all characters to lowercase
  corp_data <- tm_map(corp_data, removeWords, stopwords("english")) # remove common English stop words
  corp_data <- tm_map(corp_data, removePunctuation)                 # remove punctuation
  corp_data <- tm_map(corp_data, stripWhitespace)                   # strip extra whitespace
  return(corp_data)
}
# Function to calculate high-frequency words
high_freq_words <- function(corp_data){
  term_sparse <- DocumentTermMatrix(corp_data)
  term_matrix <- as.matrix(term_sparse)   # convert the document-term matrix into a dense matrix
  freq_words <- colSums(term_matrix)      # total count of each term
  freq_words <- as.data.frame(sort(freq_words, decreasing = TRUE))
  freq_words$words <- rownames(freq_words)
  colnames(freq_words) <- c("frequency", "word")
  return(freq_words)
}
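Because each corpus is collapsed into a single document, converting the document-term matrix to a dense matrix is affordable here; for larger corpora a sparse column sum avoids the memory blow-up. A sketch using the slam package (on which tm already depends):
# inside high_freq_words(), an equivalent sparse computation would be:
freq_words <- sort(slam::col_sums(DocumentTermMatrix(corp_data)), decreasing = TRUE)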
# Bar Chart of High Frequency words
news_text <- sample(news, round(0.1*length(news)), replace = F)
news_corp <- make_corpus(news_text)
news_corp <- clean_corp(news_corp)
## Warning in tm_map.SimpleCorpus(corp_data, removeNumbers): transformation drops
## documents
## Warning in tm_map.SimpleCorpus(corp_data, content_transformer(tolower)):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(corp_data, removeWords, stopwords("english")):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(corp_data, removePunctuation): transformation
## drops documents
## Warning in tm_map.SimpleCorpus(corp_data, stripWhitespace): transformation
## drops documents
news_high_freq_words <- high_freq_words(news_corp)
news_high_freq_words1 <- news_high_freq_words[1:15, ]
ggplot(data = news_high_freq_words1, aes(x = reorder(word, frequency), y = frequency)) +
  geom_bar(stat = "identity", fill = "darkblue") +
  labs(x = "Words", y = "Frequency", title = "Most frequent words in US news") +
  coord_flip() +
  theme_bw()
# High frequency words in blogs
blogs_text <- sample(blogs, round(0.1*length(blogs)), replace = F)
blogs_corp <- make_corpus(blogs_text)
blogs_corp<- clean_corp(blogs_corp)
## Warning in tm_map.SimpleCorpus(corp_data, removeNumbers): transformation drops
## documents
## Warning in tm_map.SimpleCorpus(corp_data, content_transformer(tolower)):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(corp_data, removeWords, stopwords("english")):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(corp_data, removePunctuation): transformation
## drops documents
## Warning in tm_map.SimpleCorpus(corp_data, stripWhitespace): transformation
## drops documents
blogs_high_freq_words <- high_freq_words(blogs_corp)
blogs_high_freq_words1 <- blogs_high_freq_words[1:15, ] # top 15 most frequent words
ggplot(data = blogs_high_freq_words1, aes(x = reorder(word, frequency), y = frequency)) +
  geom_bar(stat = "identity", fill = "darkblue") +
  labs(x = "Words", y = "Frequency", title = "Most frequent words in US blogs") +
  coord_flip() +
  theme_bw()
# High frequency words in twitter
twitter_text <- sample(twitter, round(0.1*length(twitter)), replace = F)
twitter_corp <- make_corpus(twitter_text)
twitter_corp<- clean_corp(twitter_corp)
## Warning in tm_map.SimpleCorpus(corp_data, removeNumbers): transformation drops
## documents
## Warning in tm_map.SimpleCorpus(corp_data, content_transformer(tolower)):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(corp_data, removeWords, stopwords("english")):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(corp_data, removePunctuation): transformation
## drops documents
## Warning in tm_map.SimpleCorpus(corp_data, stripWhitespace): transformation
## drops documents
twitter_high_freq_words <- high_freq_words(twitter_corp)
twitter_high_freq_words1 <- twitter_high_freq_words[1:15, ]
ggplot(data = twitter_high_freq_words1, aes(x = reorder(word, frequency), y = frequency)) +
  geom_bar(stat = "identity", fill = "darkblue") +
  labs(x = "Words", y = "Frequency", title = "Most frequent words in US Twitter") +
  coord_flip() +
  theme_bw()
Here we show three word clouds displaying the most frequent words in each corpus.
## US news word cloud
wordcloud(news_high_freq_words$word[1:100], news_high_freq_words$frequency[1:100],
colors = brewer.pal(9, "Dark2"))
## Warning in brewer.pal(9, "Dark2"): n too large, allowed maximum for palette Dark2 is 8
## Returning the palette you asked for with that many colors
## US blogs word cloud
wordcloud(blogs_high_freq_words$word[1:100], blogs_high_freq_words$frequency[1:100],
colors = brewer.pal(9, "Dark2"))
## Warning in brewer.pal(9, "Dark2"): n too large, allowed maximum for palette Dark2 is 8
## Returning the palette you asked for with that many colors
## US twitter word cloud
wordcloud(twitter_high_freq_words$word[1:100], twitter_high_freq_words$frequency[1:100],
colors = brewer.pal(9, "Dark2"))
## Warning in brewer.pal(9, "Dark2"): n too large, allowed maximum for palette Dark2 is 8
## Returning the palette you asked for with that many colors
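The warnings above appear because the Dark2 palette only contains 8 colours, so requesting 9 falls back to 8 anyway. Asking for 8 directly keeps the same output and silences the warning, e.g.:
wordcloud(news_high_freq_words$word[1:100], news_high_freq_words$frequency[1:100],
          colors = brewer.pal(8, "Dark2"))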
In this part we take a further 10% sample of each text set and tokenize it with quanteda, building unigram, bigram and trigram frequency matrices.
# Word Analysis
news_text1 <- sample(news_text, round(0.1*length(news_text)), replace = F)
news_tokens <- tokens(news_text1, what = "word",
remove_numbers = TRUE,
remove_punct = TRUE,
remove_separators = TRUE,
remove_symbols = TRUE)
news_tokens <- tokens_tolower(news_tokens)
news_tokens <- tokens_select(news_tokens, stopwords(), selection = "remove")
news_unigram <- tokens_ngrams(news_tokens, n = 1) ## unigram
news_unigram.dfm <- dfm(news_unigram, tolower = TRUE,
remove = stopwords("english"),
remove_punct = TRUE)
## Warning: '...' should not be used for tokens() arguments; use 'tokens()' first.
## Warning: 'remove' is deprecated; use dfm_remove() instead
news_bigram <- tokens_ngrams(news_tokens, n = 2) ## bigram
news_bigram.dfm <- dfm(news_bigram, tolower = TRUE,
remove = stopwords("english"),
remove_punct = TRUE)
## Warning: '...' should not be used for tokens() arguments; use 'tokens()' first.
## Warning: 'remove' is deprecated; use dfm_remove() instead
news_trigram <- tokens_ngrams(news_tokens, n = 3) ## trigram
news_trigram.dfm <- dfm(news_trigram, tolower = TRUE,
remove = stopwords("english"),
remove_punct = TRUE)
## Warning: '...' should not be used for tokens() arguments; use 'tokens()' first.
## Warning: 'remove' is deprecated; use dfm_remove() instead
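The warnings above come from passing remove and remove_punct through dfm(), which newer quanteda versions deprecate. An equivalent, warning-free formulation (a sketch reusing the objects defined above; punctuation was already dropped during tokenization) is:
news_unigram.dfm <- dfm_remove(dfm(news_unigram, tolower = TRUE), stopwords("english"))
news_bigram.dfm  <- dfm_remove(dfm(news_bigram, tolower = TRUE), stopwords("english"))
news_trigram.dfm <- dfm_remove(dfm(news_trigram, tolower = TRUE), stopwords("english"))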
## 20 top unigram words of news
topfeatures(news_unigram.dfm, 20)
## said one can new like first also last just year
## 166 68 58 58 51 49 48 47 44 43
## -- years state school city million st time people good
## 43 38 37 37 36 36 35 34 32 30
## 20 top bigram words of news
topfeatures(news_bigram.dfm, n= 20)
## st_louis last_year new_jersey h_jpmorgan high_school
## 17 9 8 7 7
## jpmorgan_us new_york san_francisco los_angeles mmf_morgan
## 6 6 6 6 5
## jpmorgan_liquid liquid_assets assets_mmf mmf_cl make_sure
## 5 5 5 5 5
## little_bit last_week united_states sales_tax years_ago
## 5 5 5 4 4
## 20 top trigram words of news
topfeatures(news_trigram.dfm, n= 20)
## jpmorgan_liquid_assets liquid_assets_mmf mmf_morgan_jpmorgan
## 5 5 4
## r_h_jpmorgan jpmorgan_prime_mmf h_jpmorgan_us
## 4 4 4
## mmf_service_h service_h_jpmorgan jpmorgan_us_govt
## 3 3 3
## us_govt_mmf jpmorgan_us_treas us_treas_plus
## 3 3 3
## treas_plus_mmf andy_griffith_show four_years_ago
## 3 2 2
## years_ago_said morgan_jpmorgan_liquid mmf_cl_c
## 2 2 2
## cl_c_jpmorgan mmf_inv_r
## 2 2
# --------------------------- Blogs ------------------------------------------
blogs_text1 <- sample(blogs_text, round(0.1*length(blogs_text)), replace = F)
blogs_tokens <- tokens(blogs_text1, what = "word", remove_numbers = TRUE, remove_punct = TRUE, remove_separators = TRUE, remove_symbols = TRUE)
blogs_tokens <- tokens_tolower(blogs_tokens)
blogs_tokens <- tokens_select(blogs_tokens, stopwords(), selection = "remove")
blogs_unigram <- tokens_ngrams(blogs_tokens, n = 1) ## unigram
blogs_unigram.dfm <- dfm(blogs_unigram, tolower = TRUE, remove = stopwords("english"), remove_punct = TRUE)
## Warning: '...' should not be used for tokens() arguments; use 'tokens()' first.
## Warning: 'remove' is deprecated; use dfm_remove() instead
blogs_bigram <- tokens_ngrams(blogs_tokens, n = 2) ## bigram
blogs_bigram.dfm <- dfm(blogs_bigram, tolower = TRUE, remove = stopwords("english"), remove_punct = TRUE)
## Warning: '...' should not be used for tokens() arguments; use 'tokens()' first.
## Warning: 'remove' is deprecated; use dfm_remove() instead
blogs_trigram <- tokens_ngrams(blogs_tokens, n = 3) ## trigram
blogs_trigram.dfm <- dfm(blogs_trigram, tolower = TRUE, remove = stopwords("english"), remove_punct = TRUE)
## Warning: '...' should not be used for tokens() arguments; use 'tokens()' first.
## Warning: 'remove' is deprecated; use dfm_remove() instead
## 20 top unigram words of blogs
topfeatures(blogs_unigram.dfm, 20)
## one can just like time get people now know new also
## 1157 1016 1016 963 875 695 618 612 576 555 551
## back even good us day first make way much
## 533 532 520 517 504 498 498 490 475
## 20 top bigram words of blogs
topfeatures(blogs_bigram.dfm, n= 20)
## can_see right_now first_time even_though new_york last_week
## 49 49 48 48 48 44
## last_year years_ago make_sure don’t_know every_day year_old
## 43 41 39 38 38 35
## pretty_much many_people don’t_think feel_like high_school can_get
## 31 31 31 31 30 28
## felt_like go_back
## 27 27
## 20 top trigram words of blogs
topfeatures(blogs_trigram.dfm, n= 20)
## new_york_city first_time_ever let_us_know
## 9 8 7
## couple_weeks_ago new_york_times don’t_get_wrong
## 6 6 6
## long_way_go still_long_way two_years_ago
## 6 5 5
## year_old_son new_york_n.y 17th_day_2nd
## 4 4 4
## day_2nd_month spent_lot_time amazon_services_llc
## 4 4 4
## services_llc_amazon llc_amazon_eu level_mp_cost
## 4 4 4
## williston_north_dakota north_dakota_rentals
## 4 4
# ------------------ Twitter -----------------------------------------
twitter_text1 <- sample(twitter_text, round(0.1*length(twitter_text)), replace = F)
twitter_tokens <- tokens(twitter_text1, what = "word", remove_numbers = TRUE, remove_punct = TRUE, remove_separators = TRUE, remove_symbols = TRUE)
twitter_tokens <- tokens_tolower(twitter_tokens)
twitter_tokens <- tokens_select(twitter_tokens, stopwords(), selection = "remove")
twitter_unigram <- tokens_ngrams(twitter_tokens, n = 1) ## unigram
twitter_unigram.dfm <- dfm(twitter_unigram, tolower = TRUE, remove = stopwords("english"), remove_punct = TRUE)
## Warning: '...' should not be used for tokens() arguments; use 'tokens()' first.
## Warning: 'remove' is deprecated; use dfm_remove() instead
twitter_bigram <- tokens_ngrams(twitter_tokens, n = 2) ## bigram
twitter_bigram.dfm <- dfm(twitter_bigram, tolower = TRUE, remove = stopwords("english"), remove_punct = TRUE)
## Warning: '...' should not be used for tokens() arguments; use 'tokens()' first.
## Warning: 'remove' is deprecated; use dfm_remove() instead
twitter_trigram <- tokens_ngrams(twitter_tokens, n = 3) ## trigram
twitter_trigram.dfm <- dfm(twitter_trigram, tolower = TRUE, remove = stopwords("english"), remove_punct = TRUE)
## Warning: '...' should not be used for tokens() arguments; use 'tokens()' first.
## Warning: 'remove' is deprecated; use dfm_remove() instead
## 20 top unigram words of twitter
topfeatures(twitter_unigram.dfm, 20)
## just like get love good can now thanks day rt u
## 1503 1213 1100 1034 971 929 897 896 886 858 856
## one time know today great lol new see go
## 805 803 799 754 743 718 705 674 671
## 20 top bigram words of twitter
topfeatures(twitter_bigram.dfm, n= 20)
## right_now last_night looking_forward thanks_follow happy_birthday
## 171 95 93 89 82
## good_morning good_luck looks_like just_got feel_like
## 75 67 64 63 60
## can_get follow_back let_know next_week make_sure
## 59 55 54 49 47
## please_follow hi_hi social_media join_us great_day
## 46 46 43 42 42
## 20 top trigram words of twitter
topfeatures(twitter_trigram.dfm, n= 20)
## hi_hi_hi happy_mother's_day happy_mothers_day
## 45 24 16
## happy_new_year let_us_know cinco_de_mayo
## 16 12 11
## thanks_follow_hope ha_ha_ha look_forward_seeing
## 9 9 8
## bad_bad_bad just_got_back looking_forward_seeing
## 8 7 7
## just_got_home dreams_come_true just_make_sure
## 6 6 6
## hope_everyone_great cake_cake_cake hope_great_day
## 6 6 6
## good_morning_everyone cant_wait_see
## 6 6
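These n-gram frequency tables are the building blocks for the planned prediction algorithm and Shiny app: given the last one or two words a user has typed, the app will look up the most frequent n-gram that starts with them and back off to shorter n-grams when no match is found. A minimal sketch of that lookup, assuming the bigram and trigram counts are first reshaped into data frames with prefix, word and frequency columns (the reshaping and the names are illustrative assumptions, not part of the code above):
predict_next_word <- function(last_two, trigrams, bigrams) {
  # try trigrams first: match on the last two words typed
  hit <- trigrams[trigrams$prefix == paste(last_two, collapse = "_"), ]
  if (nrow(hit) > 0) return(hit$word[which.max(hit$frequency)])
  # back off to bigrams: match on the last word only
  hit <- bigrams[bigrams$prefix == last_two[length(last_two)], ]
  if (nrow(hit) > 0) return(hit$word[which.max(hit$frequency)])
  "the"  # final fallback: a very common English word
}
# example call (illustrative): predict_next_word(c("new", "york"), trigram_df, bigram_df)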