Task 3: Modeling Milestone Report

Exploratory analysis - perform a thorough exploratory analysis of the data, understanding the distribution of words and relationship between the words in the corpora.

Understand frequencies of words and word pairs - build figures and tables to understand variation in the frequencies of words and word pairs in the data.

Questions to consider:

Some words are more frequent than others - what are the distributions of word frequencies? What are the frequencies of 2-grams and 3-grams in the dataset? How many unique words do you need in a frequency sorted dictionary to cover 50% of all word instances in the language? 90%? How do you evaluate how many of the words come from foreign languages? Can you think of a way to increase the coverage – identifying words that may not be in the corpora or using a smaller number of words in the dictionary to cover the same number of phrases?

Review criteria:

Does the link lead to an HTML page describing the exploratory analysis of the training data set? Has the data scientist done basic summaries of the three files? Word counts, line counts and basic data tables? Has the data scientist made basic plots, such as histograms to illustrate features of the data? Was the report written in a brief, concise style, in a way that a non-data scientist manager could appreciate?

library(fs)
library(stringr)
library(tm)
library(ggplot2)
library(ngram)
library(dplyr)
library(tidytext)
library(readr) # read_delim
library(tidyr)
library(scales)

# Locations of the three en_US corpus files.
blogs_path <- "./Coursera-SwiftKey/final/en_US/en_US.blogs.txt"
news_path <- "./Coursera-SwiftKey/final/en_US/en_US.news.txt"
twitter_path <- "./Coursera-SwiftKey/final/en_US/en_US.twitter.txt"

# read in the files
# The connections are opened in binary mode ("rb"). In text mode on Windows an
# embedded Ctrl-Z (0x1A) byte is treated as end-of-file, so readLines() stops
# early without any error.
# NOTE(review): the News line/word totals recorded in the summary output are
# very small for a ~205 MB file, which is consistent with that truncation --
# re-run the report to confirm the corrected totals.
USBlogs <- file(blogs_path, open = "rb")
USNews <- file(news_path, open = "rb")
USTwitter <- file(twitter_path, open = "rb")

# get file size (fs::file_size, one value per file)
USBlogs_size <- file_size(blogs_path)
USNews_size <- file_size(news_path)
USTwitter_size <- file_size(twitter_path)

size_bind <- rbind(USBlogs_size, USNews_size, USTwitter_size) %>%
  data.frame()

# get length on each corpora
# n = -1 reads the whole file; skipNul = TRUE drops embedded NUL bytes
USBlogs_lines <- readLines(USBlogs, n = -1, skipNul = TRUE)
USNews_lines <- readLines(USNews, n = -1, skipNul = TRUE)
USTwitter_lines <- readLines(USTwitter, n = -1, skipNul = TRUE)

# readLines() leaves an already-open connection open, so release the handles
# explicitly instead of waiting for the garbage collector to warn about them.
close(USBlogs)
close(USNews)
close(USTwitter)

USBlogs_line_length <- length(USBlogs_lines) # get length
USNews_lines_length <- length(USNews_lines) # get length
USTwitter_lines_length <- length(USTwitter_lines) # get length

lines_bind <- rbind(USBlogs_line_length, USNews_lines_length, USTwitter_lines_length) %>%
  data.frame()

# Total number of space-separated words per corpus (ngram::wordcount).
count_words <- function(lines) {
  wordcount(lines, sep = " ")
}

USBlogs_word <- count_words(USBlogs_lines)
USNews_word <- count_words(USNews_lines)
USTwitter_word <- count_words(USTwitter_lines)

# Stack the three totals as rows of a one-column data frame.
word_bind <- rbind(USBlogs_word, USNews_word, USTwitter_word) %>%
  data.frame()

# Total number of characters per corpus, summed over all lines.
total_chars <- function(lines) {
  sum(nchar(lines))
}

USBlogs_nchar <- total_chars(USBlogs_lines)
USNews_nchar <- total_chars(USNews_lines)
USTwitter_nchar <- total_chars(USTwitter_lines)

# Stack the three totals as rows of a one-column data frame.
char_bind <- rbind(USBlogs_nchar, USNews_nchar, USTwitter_nchar) %>%
  data.frame()

# Summary table: one row per corpus with file size, line count, word count and
# character count.

# get naming conventions (labels carry no stray whitespace)
data <- data.frame(file = c("USBlogs", "USNews", "USTwitter"))

# get colnames
colnames <- c("file", "size", "total_lines", "word_count", "number_character")

# merge the data by column
# Each *_bind frame holds a single column, so it is taken by position. This
# avoids depending on a column name that was mangled from the file path, and
# building the frame directly keeps the counts numeric instead of passing them
# through the character matrix that cbind() would create.
data_combine <- data.frame(
  data$file,
  as.integer(size_bind[[1]]),
  as.integer(lines_bind[[1]]),
  as.integer(word_bind[[1]]),
  as.integer(char_bind[[1]])
)
colnames(data_combine) <- colnames

data_combine
##        file      size total_lines word_count number_character
## 1   USBlogs 210160014      899288   37334131        206824509
## 2    USNews 205811889       77259    2643969         15639408
## 3 USTwitter 167105338     2360148   30373583        162122861
# Start with a single corpus (blogs) to get a first look at its vocabulary.
# Normalisation steps, applied in order to every line:
#   1. lower-case the text
#   2. transliterate to ASCII (typographic symbols become plain characters)
#   3. replace every non-alphabetic character with a space
#   4. collapse runs of whitespace into one space
#   5. strip leading whitespace
blog_text <- tolower(USBlogs_lines)
blog_text <- iconv(blog_text, to = "ASCII//TRANSLIT")
blog_text <- gsub("[^[:alpha:]]", " ", blog_text)
blog_text <- gsub("\\s+", " ", blog_text)
blog_text <- gsub("^\\s*", "", blog_text)

USBlogs_data <- data.frame(content = blog_text)

# Load tidytext's stop-word table; it is used to drop very common words.
data("stop_words")

# Split every blog line into one word per row, then keep only the words that
# are not stop words. Common words dominate the counts without telling us
# anything about the content.
blog_tokens <- unnest_tokens(USBlogs_data, word, content, token = "words")

USBlogs_data_transform <- blog_tokens %>%
  filter(!(word %in% stop_words$word))

# Frequency of every remaining word, most frequent first.
USBlogs_data_transform_count <- USBlogs_data_transform %>%
  count(word, sort = TRUE) %>%
  data.frame()

# The top of the list includes tokens that are not dictionary words, for
# example the fragments "don", "ve" and "ll".
head(USBlogs_data_transform_count, n = 20)
##      word     n
## 1    time 91030
## 2  people 61159
## 3     don 56765
## 4     day 52755
## 5      ve 47757
## 6    love 45334
## 7    life 41801
## 8   world 30701
## 9      ll 30302
## 10   book 28409
## 11   home 27999
## 12   week 27532
## 13   didn 27441
## 14    god 25547
## 15   feel 24454
## 16    lot 21605
## 17   read 21400
## 18 family 20520
## 19   days 20183
## 20   blog 19769
# non-english words:
# Tokens to exclude because they are not dictionary words (for example "ll",
# which is left over once punctuation has been replaced by spaces).
non_english <- data.frame(word = c("ts", "tt", "tm", "ll"))

USBlogs_data_transform_count_dictionary <- USBlogs_data_transform_count %>%
  anti_join(non_english, by = "word")

# plot the top 20 words
# head() never pads with NA rows when fewer than 20 words are available, and
# reorder() sorts the bars by frequency instead of alphabetically so the
# ranking can be read straight off the chart.
p <- ggplot(head(USBlogs_data_transform_count_dictionary, 20)) +
  geom_col(aes(x = n, y = reorder(word, n))) +
  labs(title = "US Blogs Top 20 Word and Its Frequency",
       x = "Frequency",
       y = "word")

p

# Build the working sample used for the coverage and n-gram analysis below:
# pool all three corpora, draw a reproducible 10 percent sample of the lines,
# and normalise the text.

corpora <- data.frame(content = c(USBlogs_lines, USNews_lines, USTwitter_lines))

# Fixed seed so the same lines are drawn on every run.
set.seed(1313)
sample_data <- sample_n(corpora, size = nrow(corpora) * 0.1)

# Lower-case, transliterate to ASCII, blank out non-alphabetic characters,
# collapse repeated whitespace and strip leading whitespace. The expressions
# inside mutate() are evaluated in order, each seeing the previous result.
clean_sample_data <- sample_data %>%
  mutate(
    content = tolower(content),
    content = iconv(content, to = "ASCII//TRANSLIT"),
    content = gsub("[^[:alpha:]]", " ", content),
    content = gsub("\\s+", " ", content),
    content = gsub("^\\s*", "", content)
  ) %>%
  data.frame()



# Profanity list: read the file one line per row, into a single column.
swear_raw <- read_delim("./swearWords.csv", delim = "\n", col_names = FALSE)

swearWords <- swear_raw %>%
  data.frame() %>%
  rename(word = X1)

# Turn the commas into spaces, then split into one lower-cased word per row so
# the list can be matched against tokenised text.
swearWords_transform <- swearWords %>%
  mutate(word = gsub(",", " ", word, fixed = TRUE)) %>%
  unnest_tokens(word, word, token = "words")

# Additional tokens to drop because they are not English words.
non_eng <- data.frame(word = c("ts", "tt", "tm", "ll", "st", "rt"))

# Word frequencies for the sampled corpus after removing stop words, swear
# words and the non-English tokens. This is the basis for the coverage figures.
excluded_words <- c(stop_words$word, swearWords_transform$word, non_eng$word)

sample_tokens <- unnest_tokens(clean_sample_data, word, content, token = "words")

clean_sample_data_removewords <- sample_tokens %>%
  filter(!(word %in% excluded_words)) %>%
  count(word, sort = TRUE) %>%
  data.frame()

# Share of all retained word occurrences contributed by each distinct word;
# needed to work out the 50% and 90% coverage points.
word_total <- sum(clean_sample_data_removewords$n)

clean_sample_data_removewords_prop <- clean_sample_data_removewords %>%
  mutate(proportions = n / word_total)

# Words are already sorted by frequency, so the running sum of the shares is
# the coverage reached by the top-k words. Keep the words up to 50%.
running_share <- cumsum(clean_sample_data_removewords_prop$proportions)

clean_sample_data_removewords_prop_50 <- clean_sample_data_removewords_prop %>%
  mutate(cumulative = running_share) %>%
  filter(cumulative <= 0.50)

# show the 20 most frequent words
head(clean_sample_data_removewords_prop_50, n = 20)
##       word     n proportions  cumulative
## 1     time 17199 0.006365358 0.006365358
## 2     love 15209 0.005628858 0.011994216
## 3      day 15106 0.005590738 0.017584954
## 4      don 13661 0.005055943 0.022640896
## 5   people 11812 0.004371627 0.027012523
## 6       ve  8315 0.003077385 0.030089909
## 7     life  7657 0.002833859 0.032923768
## 8      lol  7311 0.002705805 0.035629572
## 9    night  6326 0.002341256 0.037970828
## 10   happy  6275 0.002322380 0.040293208
## 11    week  5924 0.002192475 0.042485684
## 12    home  5481 0.002028521 0.044514204
## 13   world  5360 0.001983739 0.046497943
## 14  follow  5346 0.001978557 0.048476500
## 15    feel  5100 0.001887512 0.050364012
## 16    hope  5080 0.001880110 0.052244123
## 17 tonight  4990 0.001846801 0.054090924
## 18    didn  4726 0.001749095 0.055840019
## 19  school  4196 0.001552942 0.057392960
## 20    book  4101 0.001517782 0.058910742
# number of distinct words kept by the 50 percent cut-off
nrow(clean_sample_data_removewords_prop_50) # these are distinct words
## [1] 1400
# same running total of shares, this time keeping the words up to 90 percent
coverage_so_far <- cumsum(clean_sample_data_removewords_prop$proportions)

clean_sample_data_removewords_prop_90 <- clean_sample_data_removewords_prop %>%
  mutate(cumulative = coverage_so_far) %>%
  filter(cumulative <= 0.90)

# number of distinct words kept by the 90 percent cut-off
nrow(clean_sample_data_removewords_prop_90) # these are distinct words
## [1] 18368
# plot the numbers of words to cover the corpora
# cumulative is the running coverage expressed in percent (0-100).
clean_sample_data_removewords_prop_plot <- clean_sample_data_removewords_prop %>%
  mutate(cumulative = cumsum(proportions) * 100)

# rank of each word in the frequency-sorted list, i.e. how many distinct words
# have been used up to and including this row (seq_len is safe on empty data)
clean_sample_data_removewords_prop_plot$cum_word <-
  seq_len(nrow(clean_sample_data_removewords_prop_plot))

# want to include 50% and 90% coverage for label in graph
# The marker positions are taken from the computed 50% / 90% tables rather than
# typed in by hand, so they stay correct when the sample or the cleaning changes.
coverage_marks <- c(
  nrow(clean_sample_data_removewords_prop_50),
  nrow(clean_sample_data_removewords_prop_90)
)

clean_sample_data_removewords_prop_plot_subset <- clean_sample_data_removewords_prop_plot %>%
  subset(cum_word %in% coverage_marks)

# The y axis shows percentages (the shares were multiplied by 100 above), so it
# is labelled as percent rather than proportion.
a <- ggplot(data = clean_sample_data_removewords_prop_plot, aes(x = cum_word, y = cumulative, group = 1)) +
  geom_line() +
  geom_point(data = clean_sample_data_removewords_prop_plot_subset, colour = "blue", size = 2) +
  geom_text(data = clean_sample_data_removewords_prop_plot_subset, aes(label = cum_word), hjust = 1) +
  labs(title = "Number of Distinct Word to Cover Corpora",
       x = "Number Distinct Word in Thousands",
       y = "Percent of Coverage") +
  scale_x_continuous(labels = unit_format(unit = "T", scale = 1e-3))

a

# create the bigram from corpora sample
# one row per pair of consecutive words within a line
bigram <- clean_sample_data %>%
  unnest_tokens(two_words, content, token = "ngrams", n = 2)

# get frequency of words in bigrams
# Lines with fewer than two words produce an NA bigram; drop those before
# counting (as the trigram and quadgram counts do) so that NA cannot show up as
# a "phrase" in the ranking.
bigram_count <- bigram %>%
  filter(!is.na(two_words)) %>%
  group_by(two_words) %>%
  summarise(freq = n()) %>%
  arrange(desc(freq)) %>%
  data.frame()

# graph distribution of Bigram
# head() avoids NA padding when fewer than 20 bigrams exist; reorder() sorts
# the bars by frequency instead of alphabetically.
ggplot(data = head(bigram_count, 20)) +
  geom_col(aes(x = freq, y = reorder(two_words, freq))) +
  labs(title = "Corpora Bigram Top 20 Phrases and Their Frequency",
       y = "Bigram",
       x = "Number of Occurrence")

# Trigrams from the sampled corpus: one row per run of three consecutive words.
trigram <- unnest_tokens(clean_sample_data, three_words, content,
                         token = "ngrams", n = 3)

# Frequency of each trigram, most frequent first. Missing trigrams (NA) are
# left out of the table.
trigram_count <- trigram %>%
  filter(!is.na(three_words)) %>%
  count(three_words, sort = TRUE, name = "freq") %>%
  data.frame()

# graph distribution of trigram
# head() avoids NA padding when fewer than 20 trigrams exist; reorder() sorts
# the bars by frequency instead of alphabetically.
ggplot(data = head(trigram_count, 20)) +
  geom_col(aes(x = freq, y = reorder(three_words, freq))) +
  labs(title = "Corpora Trigram Top 20 Phrases and Their Frequency",
       y = "Trigram",
       x = "Number of Occurrence")

# Quadgrams from the sampled corpus: one row per run of four consecutive words.
quadgram <- unnest_tokens(clean_sample_data, four_words, content,
                          token = "ngrams", n = 4)

# Frequency of each quadgram, most frequent first. Missing quadgrams (NA) are
# left out of the table.
quadgram_count <- quadgram %>%
  filter(!is.na(four_words)) %>%
  count(four_words, sort = TRUE, name = "freq") %>%
  data.frame()

# graph distribution of quadgram
# head() avoids NA padding when fewer than 20 quadgrams exist; reorder() sorts
# the bars by frequency instead of alphabetically.
ggplot(data = head(quadgram_count, 20)) +
  geom_col(aes(x = freq, y = reorder(four_words, freq))) +
  labs(title = "Corpora Quadgram Top 20 Phrases and Their Frequency",
       y = "Quadgram",
       x = "Number of Occurrence")