Background
This is the Week 2 milestone report for the peer-graded assignment in the Data Science Capstone course from Coursera. The objectives of this document are as follows:
1. Demonstrate that the data has been downloaded and successfully loaded.
2. Create a basic report of summary statistics about the data sets.
3. Report any interesting findings amassed so far, and get feedback on the plans for creating a prediction algorithm and Shiny app.
4. Serve as a base for the next assignment report; it should therefore be as clear and concise as possible.
The content of this report is structured into five sections in line with these objectives.
Fetching and Loading the Training Dataset
This is the training data that will form the basis for most of the capstone. To get started, the dataset must be downloaded from the link below, not from any external website:
https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip
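For completeness, the download and extraction can be scripted; a minimal sketch (the archive unpacks into a final/ folder, and the en_US files are assumed to have been copied into the working directory before the readLines() calls below):
# Download and unzip the training data (skipped if already present)
if (!file.exists("Coursera-SwiftKey.zip")) {
  download.file("https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip",
                destfile = "Coursera-SwiftKey.zip", mode = "wb")
  unzip("Coursera-SwiftKey.zip")  # extracts into a final/ folder
}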
Load Packages
library(quanteda)
## Package version: 3.3.1
## Unicode version: 15.1
## ICU version: 74.1
## Parallel computing: 2 of 2 threads used.
## See https://quanteda.io for tutorials and examples.
library(tm)
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following objects are masked from 'package:quanteda':
##
## meta, meta<-
##
## Attaching package: 'tm'
## The following object is masked from 'package:quanteda':
##
## stopwords
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.0 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ ggplot2::annotate() masks NLP::annotate()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tokenizers)
library(stringi)
library(wordcloud)
## Loading required package: RColorBrewer
Download and Import the Dataset
con <- file("en_US.news.txt", open="r")
En_US_NEWS_text <- readLines(con, skipNul = TRUE); close(con)
## Warning in readLines(con, skipNul = TRUE): incomplete final line found on
## 'en_US.news.txt'
con <- file("en_US.blogs.txt", open="r")
En_US_blogs_text <- readLines(con, skipNul = TRUE); close(con)
con <- file("en_US.twitter.txt", open="r")
En_Twit_text <- readLines(con, skipNul = TRUE); close(con)
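The "incomplete final line" warning on en_US.news.txt is harmless here, but it can be avoided by opening the connection in binary mode, which also prevents an embedded control character in that file from cutting the read short on some platforms. A minimal alternative, same result otherwise:
# Alternative: binary mode avoids the "incomplete final line" warning
con <- file("en_US.news.txt", open = "rb")
En_US_NEWS_text <- readLines(con, encoding = "UTF-8", skipNul = TRUE)
close(con)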
Summary Statistics
file_stat <- function(text_file, lines) {
  f_size <- file.info(text_file)$size / 1024^2    # file size in MB
  nchars <- nchar(lines)                          # characters per line
  maxchars <- which.max(nchars)                   # index of the longest line (not its length)
  word_count <- sum(sapply(strsplit(lines, "\\s+"), length))
  return(c(text_file, format(round(as.double(f_size), 2), nsmall = 2),
           length(lines), maxchars, word_count))
}
En_US_news_stat<- file_stat("en_US.news.txt", En_US_NEWS_text)
En_US_blogs_stat <- file_stat("en_US.blogs.txt", En_US_blogs_text)
En_Twit_text_stat<- file_stat("en_US.twitter.txt", En_Twit_text)
test_summary <- c(En_US_news_stat, En_US_blogs_stat,En_Twit_text_stat)
df <- data.frame(matrix(unlist(test_summary), nrow = 3, byrow = TRUE))
colnames(df) <- c("Text_file", "Size(MB)", "Line_Count", "Longest_Line_Index", "Words_Count")
df
## Text_file Size(MB) Line_Count Longest_Line_Index Words_Count
## 1 en_US.news.txt 196.28 77259 14556 2643969
## 2 en_US.blogs.txt 200.42 899288 483415 37334131
## 3 en_US.twitter.txt 159.36 2360148 1105776 30373583
Exploratory Data Analysis
Below are the functions used to build the test-data corpus, clean it, and then capture the high-frequency words.
make_Corpus <- function(test_file) {
  gen_corp <- paste(test_file, collapse = " ")  # collapse all lines into a single document
  gen_corp <- VectorSource(gen_corp)
  return(Corpus(gen_corp))                      # return a tm corpus
}
clean_corp <- function(corp_data) {
  corp_data <- tm_map(corp_data, removeNumbers)                      # drop digits
  corp_data <- tm_map(corp_data, content_transformer(tolower))       # lower-case everything
  corp_data <- tm_map(corp_data, removeWords, stopwords("english"))  # drop stop words (before punctuation removal, so contractions like "don't" still match)
  corp_data <- tm_map(corp_data, removePunctuation)                  # drop punctuation
  corp_data <- tm_map(corp_data, stripWhitespace)                    # collapse extra whitespace
  return(corp_data)
}
high_freq_words <- function(corp_data) {
  term_sparse <- DocumentTermMatrix(corp_data)
  term_matrix <- as.matrix(term_sparse)  # convert the document-term matrix into a dense matrix
  freq_words <- colSums(term_matrix)     # total count of each term across the corpus
  freq_words <- as.data.frame(sort(freq_words, decreasing = TRUE))
  freq_words$word <- rownames(freq_words)
  colnames(freq_words) <- c("Frequency", "word")
  return(freq_words)
}
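One caveat on high_freq_words(): as.matrix() densifies the document-term matrix and can exhaust memory on larger samples. A memory-friendlier variant sums the sparse matrix directly, e.g. via the slam package (installed as a dependency of tm); a sketch:
# Memory-friendlier variant: sum term counts on the sparse matrix directly
high_freq_words_sparse <- function(corp_data) {
  term_sparse <- DocumentTermMatrix(corp_data)
  freq <- sort(slam::col_sums(term_sparse), decreasing = TRUE)
  data.frame(Frequency = freq, word = names(freq))
}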
Bar Charts of High-Frequency Words
This section explores the text-mining commands above on a 10% sample of each file and plots the 15 most frequent words in each source.
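One caveat: sample() is used here without a fixed seed, so the exact frequencies reported below will vary from run to run. Calling set.seed() before each sample (the seed value below is arbitrary) would make the report reproducible:
set.seed(1234)  # arbitrary seed; makes the 10% samples below reproducible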
## en_US.news.txt High frequency words
En_US_NEWS_text1<-sample(En_US_NEWS_text, round(0.1*length(En_US_NEWS_text)), replace = F)
US_news_corpus <- make_Corpus(En_US_NEWS_text1)
US_news_corpus <- clean_corp(US_news_corpus)
## Warning in tm_map.SimpleCorpus(corp_data, removeNumbers): transformation drops
## documents
## Warning in tm_map.SimpleCorpus(corp_data, content_transformer(tolower)):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(corp_data, removeWords, stopwords("english")):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(corp_data, removePunctuation): transformation
## drops documents
## Warning in tm_map.SimpleCorpus(corp_data, stripWhitespace): transformation
## drops documents
US_news_most_used_word <- high_freq_words(US_news_corpus)
US_news_most_used_word1<- US_news_most_used_word[1:15,]
p <- ggplot(data=US_news_most_used_word1, aes(x=reorder(word,Frequency), y=Frequency,
       fill=as.numeric(reorder(word,-Frequency)))) + geom_bar(stat="identity")
p + xlab("Word") +labs(title = "Most Frequent Words in US News") +theme(legend.position = "none") + coord_flip()
## en_US.blogs.txt High frequency words
En_US_blogs_text1<-sample(En_US_blogs_text, round(0.1*length(En_US_blogs_text)), replace = F)
US_blogs_corpus <- make_Corpus(En_US_blogs_text1)
US_blogs_corpus <- clean_corp(US_blogs_corpus)
## Warning in tm_map.SimpleCorpus(corp_data, removeNumbers): transformation drops
## documents
## Warning in tm_map.SimpleCorpus(corp_data, content_transformer(tolower)):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(corp_data, removeWords, stopwords("english")):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(corp_data, removePunctuation): transformation
## drops documents
## Warning in tm_map.SimpleCorpus(corp_data, stripWhitespace): transformation
## drops documents
US_blogs_most_used_word <- high_freq_words(US_blogs_corpus)
US_blogs_most_used_word1<- US_blogs_most_used_word[1:15,]
p<-ggplot(data=US_blogs_most_used_word1, aes(x=reorder(word,Frequency), y=Frequency,
fill=as.numeric(reorder(word,-Frequency))))+ geom_bar(stat="identity")
p + xlab("Word") +labs(title = "Most Frequent Words in US blogs") +theme(legend.position = "none") + coord_flip()
## en_US.twitter.txt High frequency words
En_Twit_text1<-sample(En_Twit_text, round(0.1*length(En_Twit_text)), replace = F)
twitter_corpus <- make_Corpus(En_Twit_text1)
twitter_corpus <- clean_corp(twitter_corpus)
## Warning in tm_map.SimpleCorpus(corp_data, removeNumbers): transformation drops
## documents
## Warning in tm_map.SimpleCorpus(corp_data, content_transformer(tolower)):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(corp_data, removeWords, stopwords("english")):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(corp_data, removePunctuation): transformation
## drops documents
## Warning in tm_map.SimpleCorpus(corp_data, stripWhitespace): transformation
## drops documents
twitter_most_used_word <- high_freq_words(twitter_corpus)
twitter_most_used_word1<- twitter_most_used_word[1:15,]
p<-ggplot(data=twitter_most_used_word1, aes(x=reorder(word,Frequency), y=Frequency,
fill=as.numeric(reorder(word,-Frequency))))+ geom_bar(stat="identity")
p + xlab("Word") +labs(title = "Most Frequent Words in Twitter") +theme(legend.position = "none") + coord_flip()
Word Clouds of High-Frequency Words
## US News Word Cloud
wordcloud(US_news_most_used_word$word[1:100], US_news_most_used_word$Frequency[1:100],
colors=brewer.pal(8, "Dark2"))
## US Blogs Word Cloud
wordcloud(US_blogs_most_used_word$word[1:100], US_blogs_most_used_word$Frequency[1:100],
colors=brewer.pal(8, "Dark2"))
## Twitter Word Cloud
wordcloud(twitter_most_used_word$word[1:100], twitter_most_used_word$Frequency[1:100],
colors=brewer.pal(8, "Dark2"))
Word Analysis
For the analysis of the text documents we create bag-of-words matrices of unigrams, bigrams, and trigrams; these n-gram models improve the predictive power of the analysis.
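As a toy illustration of what tokens_ngrams() produces, adjacent tokens are joined with an underscore:
# Toy example: bigrams from a short sentence
toks <- quanteda::tokens("the quick brown fox")
quanteda::tokens_ngrams(toks, n = 2)
# yields: "the_quick" "quick_brown" "brown_fox"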
## en_US.news.txt Ngram words
En_US_NEWS_text1<-sample(En_US_NEWS_text, round(0.01*length(En_US_NEWS_text)), replace = F)
US_News_tokens<- quanteda::tokens(En_US_NEWS_text1,what ="word", remove_numbers = TRUE,
remove_punct = TRUE, remove_separators = TRUE, remove_symbols =TRUE )
US_News_tokens <- tokens_tolower(US_News_tokens)
US_News_tokens <- tokens_select(US_News_tokens, stopwords(),selection ="remove")
US_News_unigram <- tokens_ngrams(US_News_tokens, n=1) ## unigram
US_News_unigram.dfm <- dfm(US_News_unigram, tolower =TRUE, remove = stopwords("english"),
remove_punct = TRUE)
## Warning: '...' should not be used for tokens() arguments; use 'tokens()' first.
## Warning: 'remove' is deprecated; use dfm_remove() instead
US_News_bigram <- tokens_ngrams(US_News_tokens, n=2) ## bigram
US_News_bigram.dfm <- dfm(US_News_bigram, tolower =TRUE, remove = stopwords("english"),
remove_punct = TRUE)
## Warning: '...' should not be used for tokens() arguments; use 'tokens()' first.
## Warning: 'remove' is deprecated; use dfm_remove() instead
US_News_trigram <- tokens_ngrams(US_News_tokens, n=3) ## trigram
US_News_trigram.dfm <- dfm(US_News_trigram, tolower =TRUE, remove = stopwords("english"),
remove_punct = TRUE)
## Warning: '...' should not be used for tokens() arguments; use 'tokens()' first.
## Warning: 'remove' is deprecated; use dfm_remove() instead
topfeatures(US_News_unigram.dfm, 10) # 10 top US News Unigram words
## said will one can like also people state year new
## 173 96 76 51 49 46 43 41 40 40
topfeatures(US_News_bigram.dfm, 10) # 10 top US News Bigram words
## los_angeles st_louis right_now high_school
## 11 10 8 8
## last_year years_ago new_york associated_press
## 7 6 6 5
## looks_like school_district
## 4 4
topfeatures(US_News_trigram.dfm, 10) # 10 top US News Trigram words
## cents_per_gallon crested_butte_mountain first_quarter_year
## 3 3 2
## st_louis_county rain_barrel_use barrel_use_tubing
## 2 2 2
## butte_mountain_guides los_angeles_police returned_state_police
## 2 2 2
## treasurer_andy_dillon
## 2
## en_US.blogs.txt Ngram words
En_US_blogs_text1<-sample(En_US_blogs_text, round(0.02*length(En_US_blogs_text)), replace = F)
US_blogs_tokens<-quanteda::tokens(En_US_blogs_text1,what ="word", remove_numbers = TRUE,
remove_punct = TRUE, remove_separators = TRUE, remove_symbols =TRUE )
US_blogs_tokens <- tokens_tolower(US_blogs_tokens)
US_blogs_tokens <- tokens_select(US_blogs_tokens, stopwords(),selection ="remove")
US_blogs_unigram <- tokens_ngrams(US_blogs_tokens, n=1) ## unigram
US_blogs_unigram.dfm <- dfm(US_blogs_unigram, tolower =TRUE, remove = stopwords("english"),
remove_punct = TRUE)
## Warning: '...' should not be used for tokens() arguments; use 'tokens()' first.
## Warning: 'remove' is deprecated; use dfm_remove() instead
US_blogs_bigram <- tokens_ngrams(US_blogs_tokens, n=2) ## bigram
US_blogs_bigram.dfm <- dfm(US_blogs_bigram, tolower =TRUE, remove = stopwords("english"),
remove_punct = TRUE)
## Warning: '...' should not be used for tokens() arguments; use 'tokens()' first.
## Warning: 'remove' is deprecated; use dfm_remove() instead
US_blogs_trigram <- tokens_ngrams(US_blogs_tokens, n=3) ## trigram
US_blogs_trigram.dfm <- dfm(US_blogs_trigram, tolower =TRUE, remove = stopwords("english"),
remove_punct = TRUE)
## Warning: '...' should not be used for tokens() arguments; use 'tokens()' first.
## Warning: 'remove' is deprecated; use dfm_remove() instead
topfeatures(US_blogs_unigram.dfm, 20) # 20 top US blogs Unigram words
## one will can just like time get know people now new
## 2561 2256 1970 1934 1866 1834 1416 1260 1206 1182 1095
## day also first even make us think much really
## 1086 1079 1064 1063 1030 1025 1012 1012 1009
topfeatures(US_blogs_bigram.dfm, 20) # 20 top US blogs Bigram words
## first_time right_now years_ago even_though feel_like new_york
## 100 96 96 86 85 82
## don’t_know can_see last_year make_sure every_day last_week
## 82 78 77 74 71 69
## one_day high_school one_thing last_night don’t_think every_time
## 67 63 62 58 57 54
## can_get many_people
## 53 52
topfeatures(US_blogs_trigram.dfm, 20) # 20 top US blogs Trigram words
## fake_fake_fake bmw_service_center
## 33 22
## service_center_california south_carolina_mortgage
## 22 13
## carolina_mortgage_refinancing north_dakota_maquest
## 13 12
## new_york_times new_york_city
## 10 10
## amazon_services_llc services_llc_amazon
## 10 10
## llc_amazon_eu couple_weeks_ago
## 10 9
## world_war_ii horse_c_k
## 8 8
## cedar_rapids_iowa please_let_know
## 8 7
## spend_much_time please_feel_free
## 7 7
## dhs_cedar_rapids just_make_sure
## 7 6
## en_US.twitter.txt Ngram words
En_Twit_text1<-sample(En_Twit_text, round(0.02*length(En_Twit_text)), replace = F)
twitter_tokens<- quanteda::tokens(En_Twit_text1,what ="word", remove_numbers = TRUE,
remove_punct = TRUE, remove_separators = TRUE, remove_symbols =TRUE )
twitter_tokens <- tokens_tolower(twitter_tokens)
twitter_tokens <- tokens_select(twitter_tokens, stopwords(),selection ="remove")
twitter_unigram <- tokens_ngrams(twitter_tokens, n=1) ## unigram
twitter_unigram.dfm <- dfm(twitter_unigram, tolower =TRUE, remove = stopwords("english"),
remove_punct = TRUE)
## Warning: '...' should not be used for tokens() arguments; use 'tokens()' first.
## Warning: 'remove' is deprecated; use dfm_remove() instead
twitter_bigram <- tokens_ngrams(twitter_tokens, n=2) ## bigram
twitter_bigram.dfm <- dfm(twitter_bigram, tolower =TRUE, remove = stopwords("english"),
remove_punct = TRUE)
## Warning: '...' should not be used for tokens() arguments; use 'tokens()' first.
## Warning: 'remove' is deprecated; use dfm_remove() instead
twitter_trigram <- tokens_ngrams(twitter_tokens, n=3) ## trigram
twitter_trigram.dfm <- dfm(twitter_trigram, tolower =TRUE, remove = stopwords("english"),
remove_punct = TRUE)
## Warning: '...' should not be used for tokens() arguments; use 'tokens()' first.
## Warning: 'remove' is deprecated; use dfm_remove() instead
topfeatures(twitter_unigram.dfm, 10) # 10 top Unigram words
## just like get love good will thanks can day rt
## 3056 2390 2287 2263 1953 1893 1880 1822 1737 1713
topfeatures(twitter_bigram.dfm, 10) # 10 top Bigram words
## right_now last_night looking_forward happy_birthday good_morning
## 335 227 166 159 155
## thanks_follow feel_like just_got good_luck follow_back
## 147 132 131 124 113
topfeatures(twitter_trigram.dfm, 10) # 10 top Trigram words
## let_us_know happy_mothers_day happy_new_year
## 38 33 29
## cinco_de_mayo happy_mother's_day love_love_love
## 24 23 20
## yes_yes_yes looking_forward_seeing please_follow_back
## 18 17 15
## cake_cake_cake
## 14
Conclusion
The final deliverable of the capstone project is a predictive algorithm deployed as a Shiny app. The app will take a phrase (multiple words) in a text box as input and output a prediction of the next word. Priorities for the final project include:
- Tokenization and bag-of-words with multiple n-grams.
- Using a small sample (~1 to 5%) of the data.
- Running the machine-learning programs to develop the predictive model.
- Exploring options to improve the accuracy and speed of execution.
A rough sketch of the kind of lookup the prediction algorithm might use is given below.
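As an illustration only (not the final implementation), a "stupid backoff"-style next-word lookup over n-gram frequency tables might look like the following; the table names and columns are hypothetical placeholders.
predict_next_word <- function(phrase, bigram_freq, trigram_freq, k = 3) {
  # Hypothetical sketch: assumes bigram_freq / trigram_freq are data frames
  # with an `ngram` column (underscore-joined, as produced by tokens_ngrams)
  # already sorted by decreasing frequency.
  words <- tolower(unlist(strsplit(phrase, "\\s+")))
  n <- length(words)
  if (n >= 2) {                                # try the trigram table first
    prefix <- paste0(paste(words[(n - 1):n], collapse = "_"), "_")
    hits <- trigram_freq$ngram[startsWith(trigram_freq$ngram, prefix)]
    if (length(hits) > 0) return(head(sub(".*_", "", hits), k))
  }
  if (n >= 1) {                                # back off to the bigram table
    prefix <- paste0(words[n], "_")
    hits <- bigram_freq$ngram[startsWith(bigram_freq$ngram, prefix)]
    if (length(hits) > 0) return(head(sub(".*_", "", hits), k))
  }
  head(c("the", "to", "and"), k)               # last resort: common words
}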