Exploratory analysis - perform a thorough exploratory analysis of the data, understanding the distribution of words and relationship between the words in the corpora.
Understand frequencies of words and word pairs - build figures and tables to understand variation in the frequencies of words and word pairs in the data.
Some words are more frequent than others - what are the distributions of word frequencies? What are the frequencies of 2-grams and 3-grams in the dataset? How many unique words do you need in a frequency sorted dictionary to cover 50% of all word instances in the language? 90%? How do you evaluate how many of the words come from foreign languages? Can you think of a way to increase the coverage – identifying words that may not be in the corpora or using a smaller number of words in the dictionary to cover the same number of phrases?
Does the link lead to an HTML page describing the exploratory analysis of the training data set? Has the data scientist done basic summaries of the three files? Word counts, line counts and basic data tables? Has the data scientist made basic plots, such as histograms to illustrate features of the data? Was the report written in a brief, concise style, in a way that a non-data scientist manager could appreciate?
library(fs)
library(stringr)
library(tm)
library(ggplot2)
library(ngram)
library(dplyr)
library(tidytext)
library(readr) # read_delim
library(tidyr)
library(scales)
# ---- Basic summaries of the three en_US corpus files ----
# Paths to the raw corpus files. The original variable names are kept, but
# they now hold plain path strings instead of unopened connection objects, so
# no connection is left behind for the garbage collector to clean up.
# readLines() and file_size() both accept a path string.
corpus_dir <- "./Coursera-SwiftKey/final/en_US"
USBlogs <- file.path(corpus_dir, "en_US.blogs.txt")
USNews <- file.path(corpus_dir, "en_US.news.txt")
USTwitter <- file.path(corpus_dir, "en_US.twitter.txt")

# Read every line of a text file through a binary-mode connection.
#
# path: path to the file to read.
# Returns a character vector with one element per line (embedded NULs
# skipped).
#
# Why binary mode: a text-mode read can stop early at an embedded control
# character on some platforms (Windows treats Ctrl-Z as end-of-file). The
# connection is closed on exit even if readLines() fails.
read_corpus_lines <- function(path) {
  con <- file(path, open = "rb")
  on.exit(close(con), add = TRUE)
  readLines(con, n = -1, skipNul = TRUE) # n = -1 reads the whole file
}

# File size on disk (fs::file_size), one row per file.
USBlogs_size <- file_size(USBlogs)
USNews_size <- file_size(USNews)
USTwitter_size <- file_size(USTwitter)
size_bind <- rbind(USBlogs_size, USNews_size, USTwitter_size) %>%
  data.frame()

# Line counts.
USBlogs_lines <- read_corpus_lines(USBlogs)
USNews_lines <- read_corpus_lines(USNews)
USTwitter_lines <- read_corpus_lines(USTwitter)
USBlogs_line_length <- length(USBlogs_lines)
USNews_lines_length <- length(USNews_lines)
USTwitter_lines_length <- length(USTwitter_lines)
lines_bind <- rbind(USBlogs_line_length, USNews_lines_length,
                    USTwitter_lines_length) %>%
  data.frame()

# Word counts, splitting on single spaces (ngram::wordcount).
USBlogs_word <- wordcount(USBlogs_lines, sep = " ")
USNews_word <- wordcount(USNews_lines, sep = " ")
USTwitter_word <- wordcount(USTwitter_lines, sep = " ")
word_bind <- rbind(USBlogs_word, USNews_word, USTwitter_word) %>%
  data.frame()

# Character counts, summed over all lines.
USBlogs_nchar <- sum(nchar(USBlogs_lines))
USNews_nchar <- sum(nchar(USNews_lines))
USTwitter_nchar <- sum(nchar(USTwitter_lines))
char_bind <- rbind(USBlogs_nchar, USNews_nchar, USTwitter_nchar) %>%
  data.frame()

# File labels (the stray leading space in " USNews" is removed).
data <- data.frame(file = c("USBlogs", "USNews", "USTwitter"))

# Assemble the summary table directly with typed columns. Each *_bind frame
# has exactly one column, so it is taken by position rather than through the
# auto-generated (path-derived) column name, and the numbers no longer take a
# detour through a character matrix. No variable named `colnames` is created,
# so the base function is not shadowed.
data_combine <- data.frame(
  file = data$file,
  size = as.integer(size_bind[[1]]),
  total_lines = as.integer(lines_bind[[1]]),
  word_count = as.integer(word_bind[[1]]),
  number_character = as.integer(char_bind[[1]])
)
data_combine
# NOTE(review): the table below was recorded with the earlier text-mode read.
# Its USNews row (77,259 lines and ~15.6M characters for a ~205.8 MB file)
# looks like a truncated read; expect that row to change when this is re-run
# with the binary-mode reader above.
## file size total_lines word_count number_character
## 1 USBlogs 210160014 899288 37334131 206824509
## 2 USNews 205811889 77259 2643969 15639408
## 3 USTwitter 167105338 2360148 30373583 162122861
# ---- Blog corpus: text normalisation ----
# Start with a single corpus (blogs) to get a first look at word usage.
# Normalisation steps, applied in this order to every line:
#   1. lower-case the text
#   2. convert to ASCII, transliterating accented letters and typographic
#      symbols where an ASCII equivalent exists
#   3. replace every non-alphabetic character with a space
#   4. collapse runs of whitespace into a single space
#   5. strip leading whitespace
blog_text <- tolower(USBlogs_lines)
blog_text <- iconv(blog_text, to = "ASCII//TRANSLIT")
blog_text <- gsub("[^[:alpha:]]", " ", blog_text)
blog_text <- gsub("\\s+", " ", blog_text)
blog_text <- gsub("^\\s*", "", blog_text)
USBlogs_data <- data.frame(content = blog_text, stringsAsFactors = FALSE)
# ---- Blog corpus: word frequencies without stop words ----
# Load tidytext's stop-word list; very common function words say little about
# the content, so they are dropped before counting.
data("stop_words")
# One row per word token, with stop words removed.
USBlogs_data_transform <- anti_join(
  unnest_tokens(USBlogs_data, word, content, token = "words"),
  stop_words,
  by = "word"
)
# Occurrences per distinct word, most frequent first.
USBlogs_data_transform_count <- data.frame(
  count(USBlogs_data_transform, word, sort = TRUE)
)
# Several top entries ("don", "ve", "ll", "didn") are not dictionary words:
# they are the pieces of contractions left over once the apostrophe has been
# replaced by a space during cleaning.
head(USBlogs_data_transform_count, 20)
## word n
## 1 time 91030
## 2 people 61159
## 3 don 56765
## 4 day 52755
## 5 ve 47757
## 6 love 45334
## 7 life 41801
## 8 world 30701
## 9 ll 30302
## 10 book 28409
## 11 home 27999
## 12 week 27532
## 13 didn 27441
## 14 god 25547
## 15 feel 24454
## 16 lot 21605
## 17 read 21400
## 18 family 20520
## 19 days 20183
## 20 blog 19769
# ---- Blog corpus: top-20 word plot ----
# Word fragments that are not English words; removed before plotting.
non_english <- data.frame(word = c("ts", "tt", "tm", "ll"))
USBlogs_data_transform_count_dictionary <- USBlogs_data_transform_count %>%
  anti_join(non_english, by = "word")
# Plot the 20 most frequent words.
# - head() is used instead of [1:20, ] so a table with fewer than 20 rows is
#   not padded with NA rows.
# - reorder(word, n) ranks the bars by frequency (most frequent on top);
#   mapping the bare column would draw them in alphabetical order.
p <- ggplot(head(USBlogs_data_transform_count_dictionary, 20)) +
  geom_col(aes(x = n, y = reorder(word, n))) +
  labs(title = "US Blogs Top 20 Word and Its Frequency",
       x = "Frequency",
       y = "Word")
p
# ---- Pooled corpus: 10% sample and text normalisation ----
# Stack the lines of all three sources into one single-column data frame.
# The n-gram and coverage work below runs on a 10% random sample of it.
corpora <- bind_rows(
  lapply(
    list(USBlogs_lines, USNews_lines, USTwitter_lines),
    function(lines) data.frame(content = lines)
  )
)
# Fixed seed so the same rows are drawn on every run.
set.seed(1313)
sample_data <- corpora %>%
  sample_n(nrow(corpora) * (.1))
# Same normalisation as for the blog corpus; each assignment inside mutate()
# sees the result of the previous one: lower-case, transliterate to ASCII,
# blank out non-letters, squeeze whitespace, strip leading whitespace.
clean_sample_data <- data.frame(
  mutate(
    sample_data,
    content = tolower(content),
    content = iconv(content, to = "ASCII//TRANSLIT"),
    content = gsub("[^[:alpha:]]", " ", content),
    content = gsub("\\s+", " ", content),
    content = gsub("^\\s*", "", content)
  )
)
# ---- Word lists to exclude from the sample ----
# Profanity list: read the file one physical line per row (single column,
# named X1 by readr because there is no header) and call that column `word`.
swearWords <- rename(
  data.frame(read_delim("./swearWords.csv", delim = "\n", col_names = FALSE)),
  word = X1
)
# Turn the comma separators into spaces, then split so that each swear word
# sits in its own row.
swearWords_transform <- unnest_tokens(
  mutate(swearWords, word = gsub(",", " ", word)),
  word, word,
  token = "words"
)
# Non-English fragments to drop as well.
non_eng <- data.frame(word = c("ts", "tt", "tm", "ll", "st", "rt"))
# ---- Sample: word frequencies after exclusions ----
# Tokenise the cleaned sample into single words, discard any word found in
# the stop-word, swear-word or non-English lists, and count what is left,
# most frequent first. This table feeds the coverage analysis.
clean_sample_data_removewords <- clean_sample_data %>%
  unnest_tokens(word, content, token = "words") %>%
  filter(
    !word %in% c(stop_words$word, swearWords_transform$word, non_eng$word)
  ) %>%
  count(word, sort = TRUE) %>%
  data.frame()
# ---- Sample: vocabulary needed for 50% and 90% coverage ----
# Share of all remaining word instances taken by each word. The input is
# already sorted from most to least frequent, so a running sum of these
# shares is the coverage reached by the top-k words.
clean_sample_data_removewords_prop <- clean_sample_data_removewords %>%
  mutate(proportions = as.double(n / sum(n)))

# Most frequent words needed to reach a coverage target.
#
# freq_table: data frame sorted by descending frequency with a `proportions`
#             column.
# target:     coverage to reach, as a fraction between 0 and 1.
# Returns the leading rows of `freq_table`, with a `cumulative` column added,
# up to AND including the word whose running share first reaches `target`.
#
# Filtering on `cumulative <= target` would stop one word short and cover
# slightly less than the target, so the test is made on the coverage reached
# *before* each word instead.
words_to_cover <- function(freq_table, target) {
  freq_table %>%
    mutate(cumulative = cumsum(proportions)) %>%
    filter(dplyr::lag(cumulative, default = 0) < target)
}

# get the amount of words to cover 50% of corpora
clean_sample_data_removewords_prop_50 <-
  words_to_cover(clean_sample_data_removewords_prop, 0.50)
# reveal the top 20 words
head(clean_sample_data_removewords_prop_50, 20)
## word n proportions cumulative
## 1 time 17199 0.006365358 0.006365358
## 2 love 15209 0.005628858 0.011994216
## 3 day 15106 0.005590738 0.017584954
## 4 don 13661 0.005055943 0.022640896
## 5 people 11812 0.004371627 0.027012523
## 6 ve 8315 0.003077385 0.030089909
## 7 life 7657 0.002833859 0.032923768
## 8 lol 7311 0.002705805 0.035629572
## 9 night 6326 0.002341256 0.037970828
## 10 happy 6275 0.002322380 0.040293208
## 11 week 5924 0.002192475 0.042485684
## 12 home 5481 0.002028521 0.044514204
## 13 world 5360 0.001983739 0.046497943
## 14 follow 5346 0.001978557 0.048476500
## 15 feel 5100 0.001887512 0.050364012
## 16 hope 5080 0.001880110 0.052244123
## 17 tonight 4990 0.001846801 0.054090924
## 18 didn 4726 0.001749095 0.055840019
## 19 school 4196 0.001552942 0.057392960
## 20 book 4101 0.001517782 0.058910742
# NOTE(review): the two counts recorded below came from the earlier
# `cumulative <= target` filter. With the word that crosses the threshold now
# included, expect each to be one higher on re-run (unless a running share
# lands exactly on the target).
# number of distinct words needed to cover 50 percent of the corpora
nrow(clean_sample_data_removewords_prop_50) # these are distinct words
## [1] 1400
# get the amount of words to cover 90% of corpora
clean_sample_data_removewords_prop_90 <-
  words_to_cover(clean_sample_data_removewords_prop, 0.90)
# number of distinct words needed to cover 90 percent of the corpora
nrow(clean_sample_data_removewords_prop_90) # these are distinct words
## [1] 18368
# ---- Sample: coverage curve ----
# Cumulative coverage (in percent) against the number of distinct words used,
# taking words from most to least frequent.
clean_sample_data_removewords_prop_plot <- clean_sample_data_removewords_prop %>%
  mutate(cumulative = cumsum(proportions) * 100)
# Rank of each word = size of the dictionary when that word is added.
# seq_len() stays empty for an empty table, where 1:nrow() would give c(1, 0).
clean_sample_data_removewords_prop_plot$cum_word <-
  seq_len(nrow(clean_sample_data_removewords_prop_plot))
# Points to highlight: the dictionary sizes found for 50% and 90% coverage.
# They are taken from the coverage tables rather than hard-coded (the earlier
# literals 1400 and 18368 were the row counts recorded from one run), so the
# markers follow the data if the sample or the cleaning changes.
coverage_marks <- c(
  nrow(clean_sample_data_removewords_prop_50),
  nrow(clean_sample_data_removewords_prop_90)
)
clean_sample_data_removewords_prop_plot_subset <-
  clean_sample_data_removewords_prop_plot %>%
  subset(cum_word %in% coverage_marks)
# Line for the full curve, blue markers plus text labels at the two
# highlighted dictionary sizes. The y axis is labelled as a percentage
# because `cumulative` was multiplied by 100 above.
a <- ggplot(data = clean_sample_data_removewords_prop_plot,
            aes(x = cum_word, y = cumulative, group = 1)) +
  geom_line() +
  geom_point(data = clean_sample_data_removewords_prop_plot_subset,
             colour = "blue", size = 2) +
  geom_text(data = clean_sample_data_removewords_prop_plot_subset,
            aes(label = cum_word), hjust = 1) +
  labs(title = "Number of Distinct Word to Cover Corpora",
       x = "Number Distinct Word in Thousands",
       y = "Coverage (%)") +
  scale_x_continuous(labels = unit_format(unit = "T", scale = 1e-3))
a
# ---- Sample: bigrams ----
# Split every cleaned line of the sample into overlapping two-word phrases.
bigram <- clean_sample_data %>%
  unnest_tokens(two_words, content, token = "ngrams", n = 2)
# Frequency of each bigram, most frequent first. NA phrases are dropped
# before counting, matching what the trigram and quadgram tables do, so a
# missing value can never show up as a bar in the chart.
bigram_count <- bigram %>%
  filter(!is.na(two_words)) %>%
  group_by(two_words) %>%
  summarise(freq = n()) %>%
  arrange(desc(freq)) %>%
  data.frame()
# Top-20 bigrams. head() avoids NA padding when fewer than 20 rows exist, and
# reorder() ranks the bars by frequency instead of alphabetically.
ggplot(data = head(bigram_count, 20)) +
  geom_col(aes(x = freq, y = reorder(two_words, freq))) +
  labs(title = "Corpora Bigram Top 20 Phrases and Their Frequency",
       y = "Bigram",
       x = "Number of Occurrence")
# ---- Sample: trigrams ----
# Split every cleaned line of the sample into overlapping three-word phrases.
trigram <- clean_sample_data %>%
  unnest_tokens(three_words, content, token = "ngrams", n = 3)
# Frequency of each trigram, most frequent first; NA phrases are removed
# before counting.
trigram_count <- trigram %>%
  filter(!is.na(three_words)) %>%
  group_by(three_words) %>%
  summarise(freq = n()) %>%
  arrange(desc(freq)) %>%
  data.frame()
# Top-20 trigrams. head() avoids NA padding when fewer than 20 rows exist,
# and reorder() ranks the bars by frequency instead of alphabetically.
ggplot(data = head(trigram_count, 20)) +
  geom_col(aes(x = freq, y = reorder(three_words, freq))) +
  labs(title = "Corpora Trigram Top 20 Phrases and Their Frequency",
       y = "Trigram",
       x = "Number of Occurrence")
# ---- Sample: quadgrams ----
# Split every cleaned line of the sample into overlapping four-word phrases.
quadgram <- clean_sample_data %>%
  unnest_tokens(four_words, content, token = "ngrams", n = 4)
# Frequency of each quadgram, most frequent first; NA phrases are removed
# before counting.
quadgram_count <- quadgram %>%
  filter(!is.na(four_words)) %>%
  group_by(four_words) %>%
  summarise(freq = n()) %>%
  arrange(desc(freq)) %>%
  data.frame()
# Top-20 quadgrams. head() avoids NA padding when fewer than 20 rows exist,
# and reorder() ranks the bars by frequency instead of alphabetically.
ggplot(data = head(quadgram_count, 20)) +
  geom_col(aes(x = freq, y = reorder(four_words, freq))) +
  labs(title = "Corpora Quadgram Top 20 Phrases and Their Frequency",
       y = "Quadgram",
       x = "Number of Occurrence")