The Coursera Data Science Capstone aims to develop a model for Natural Language Processing, as used in applications such as SwiftKey. As a precursor to developing a robust model, this report examines the features of a large repository of English-language text and discusses model development plans for the next stage of the project.
Data from three English datasets (blogs, news, and Twitter) will be used in this analysis and the eventual model. Before any major clean-up or transformations are done, the data will be loaded and summarized for features such as size, length of each line, and number of words, and major outliers will be noted.
#Load the packages used throughout this report: stringi for string statistics,
#tm (which also attaches NLP) for text mining, and ggplot2 for plotting
library(stringi)
library(tm)
library(ggplot2)

en_blogs <- readLines("final/en_US/en_US.blogs.txt")
en_news <- readLines("final/en_US/en_US.news.txt")
en_twitter <- readLines("final/en_US/en_US.twitter.txt", warn=FALSE)
all_stats <- stri_stats_general(c(en_blogs, en_news, en_twitter))
blogs_stats <- stri_stats_general(en_blogs)
news_stats <- stri_stats_general(en_news)
twitter_stats <- stri_stats_general(en_twitter)
blogs_wordcount <- stri_count_words(en_blogs)
news_wordcount <- stri_count_words(en_news)
twitter_wordcount <- stri_count_words(en_twitter)
textsummary <- data.frame(dataset = c("Blogs", "News", "Twitter"),
                          Line.Count = c(blogs_stats[1], news_stats[1], twitter_stats[1]),
                          Char.Count = c(blogs_stats[3], news_stats[3], twitter_stats[3]),
                          Char.Per.Line = c(round(blogs_stats[3]/blogs_stats[1],2), round(news_stats[3]/news_stats[1],2), round(twitter_stats[3]/twitter_stats[1],2)),
                          Word.Count = c(sum(blogs_wordcount), sum(news_wordcount), sum(twitter_wordcount)),
                          Words.Per.Line = c(round(sum(blogs_wordcount)/blogs_stats[1],2), round(sum(news_wordcount)/news_stats[1],2), round(sum(twitter_wordcount)/twitter_stats[1],2)),
                          Ave.Word.Length = c(round(blogs_stats[3]/sum(blogs_wordcount),2), round(news_stats[3]/sum(news_wordcount),2), round(twitter_stats[3]/sum(twitter_wordcount),2)))
print(textsummary)
## dataset Line.Count Char.Count Char.Per.Line Word.Count Words.Per.Line
## 1 Blogs 899288 206824382 229.99 37546246 41.75
## 2 News 1010242 203223154 201.16 34762395 34.41
## 3 Twitter 2360148 162096031 68.68 30093369 12.75
## Ave.Word.Length
## 1 5.51
## 2 5.85
## 3 5.39
The summary statistics generated above give a good insight into the average properties of text from each data source. Next, it will be useful to see how the texts are distributed in terms of words per line.
twitter_words <- data.frame(Word.Count = twitter_wordcount, Category = rep("Twitter", length(twitter_wordcount)))
blogs_words <- data.frame(Word.Count = blogs_wordcount, Category = rep("Blogs", length(blogs_wordcount)))
news_words <- data.frame(Word.Count = news_wordcount, Category = rep("News", length(news_wordcount)))
allsources <- rbind(twitter_words, blogs_words, news_words)
#Check Quartiles for each category to ensure we set useful axes on the histograms
summary(twitter_words)
## Word.Count Category
## Min. : 1.00 Twitter:2360148
## 1st Qu.: 7.00
## Median :12.00
## Mean :12.75
## 3rd Qu.:18.00
## Max. :47.00
summary(blogs_words)
## Word.Count Category
## Min. : 0.00 Blogs:899288
## 1st Qu.: 9.00
## Median : 28.00
## Mean : 41.75
## 3rd Qu.: 60.00
## Max. :6726.00
summary(news_words)
## Word.Count Category
## Min. : 1.00 News:1010242
## 1st Qu.: 19.00
## Median : 32.00
## Mean : 34.41
## 3rd Qu.: 46.00
## Max. :1796.00
ggplot(allsources, aes(y=Word.Count, x = Category)) + geom_boxplot() + coord_cartesian(ylim = c(0, 150))
ggplot(allsources, aes(Word.Count, ..density.., colour = Category)) +
geom_freqpoly(binwidth = 5) + coord_cartesian(xlim = c(0, 150))
Next, the data is cleaned to remove ‘stop words’ such as "you", "ours" and "than", which add little value to this analysis; the summary table and histograms are then regenerated to show how this changes the data.
en_blogs2 <- removeWords(tolower(en_blogs), stopwords("en"))
en_news2 <- removeWords(tolower(en_news), stopwords("en"))
en_twitter2 <- removeWords(tolower(en_twitter), stopwords("en"))
all_stats <- stri_stats_general(c(en_blogs2, en_news2, en_twitter2))
blogs_stats2 <- stri_stats_general(en_blogs2)
news_stats2 <- stri_stats_general(en_news2)
twitter_stats2 <- stri_stats_general(en_twitter2)
blogs_wordcount2 <- stri_count_words(en_blogs2)
news_wordcount2 <- stri_count_words(en_news2)
twitter_wordcount2 <- stri_count_words(en_twitter2)
textsummary2 <- data.frame(dataset = c("Blogs", "News", "Twitter"),
                           Line.Count = c(blogs_stats2[1], news_stats2[1], twitter_stats2[1]),
                           Char.Count = c(blogs_stats2[3], news_stats2[3], twitter_stats2[3]),
                           Char.Per.Line = c(round(blogs_stats2[3]/blogs_stats2[1],2), round(news_stats2[3]/news_stats2[1],2), round(twitter_stats2[3]/twitter_stats2[1],2)),
                           Word.Count = c(sum(blogs_wordcount2), sum(news_wordcount2), sum(twitter_wordcount2)),
                           Words.Per.Line = c(round(sum(blogs_wordcount2)/blogs_stats2[1],2), round(sum(news_wordcount2)/news_stats2[1],2), round(sum(twitter_wordcount2)/twitter_stats2[1],2)),
                           Ave.Word.Length = c(round(blogs_stats2[3]/sum(blogs_wordcount2),2), round(news_stats2[3]/sum(news_wordcount2),2), round(twitter_stats2[3]/sum(twitter_wordcount2),2)))
print(textsummary2)
## dataset Line.Count Char.Count Char.Per.Line Word.Count Words.Per.Line
## 1 Blogs 899288 156343875 173.85 19804208 22.02
## 2 News 1010242 162592852 160.94 20648155 20.44
## 3 Twitter 2360148 126645443 53.66 17460243 7.40
## Ave.Word.Length
## 1 7.89
## 2 7.87
## 3 7.25
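To put the effect of stop-word removal in perspective, the fraction of words removed from each source can be derived from the two summary tables (a quick sketch; the values in the comment are approximate and follow directly from the word counts above):

#Fraction of words removed by stop-word filtering, per source
round(1 - textsummary2$Word.Count/textsummary$Word.Count, 2)
#roughly 0.47 (Blogs), 0.41 (News), 0.42 (Twitter)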
Interestingly, all three sources lose a broadly similar share of their words to stop-word removal (roughly 40-47%), with blogs making the heaviest use of stop words; Twitter's much smaller absolute drop in words per line largely reflects its short lines to begin with. Next, the histograms are regenerated to see how the distribution of text lengths has changed.
twitter_words2 <- data.frame(Word.Count = twitter_wordcount2, Category = rep("Twitter", length(twitter_wordcount2)))
blogs_words2 <- data.frame(Word.Count = blogs_wordcount2, Category = rep("Blogs", length(blogs_wordcount2)))
news_words2 <- data.frame(Word.Count = news_wordcount2, Category = rep("News", length(news_wordcount2)))
allsources2 <- rbind(twitter_words2, blogs_words2, news_words2)
#Check Quartiles for each category to ensure we set useful axes on the histograms
summary(twitter_words2)
## Word.Count Category
## Min. : 0.000 Twitter:2360148
## 1st Qu.: 5.000
## Median : 8.000
## Mean : 8.264
## 3rd Qu.:12.000
## Max. :47.000
summary(blogs_words2)
## Word.Count Category
## Min. : 0.0 Blogs:899288
## 1st Qu.: 6.0
## Median : 17.0
## Mean : 24.4
## 3rd Qu.: 35.0
## Max. :4490.0
summary(news_words2)
## Word.Count Category
## Min. : 0.00 News:1010242
## 1st Qu.: 12.00
## Median : 20.00
## Mean : 21.74
## 3rd Qu.: 29.00
## Max. :1796.00
ggplot(allsources2, aes(Word.Count, ..density.., colour = Category)) +
geom_freqpoly(binwidth = 5) + coord_cartesian(xlim = c(0, 150))
With the stop words removed, it is now possible to analyze word frequency and common word pairs. This is done by creating a Corpus from all three datasets and using the tm package to create and analyze a TermDocumentMatrix. Given the large size of the datasets, this is initially done on a random sample of 15,000 lines from each of the three sources: blogs, news, and Twitter.
blogs_sample <- en_blogs2[sample(1:length(en_blogs2),15000)]
blogs_sample <- removePunctuation(blogs_sample)
blogs_sample <- stripWhitespace(blogs_sample)
news_sample <- en_news2[sample(1:length(en_news2),15000)]
news_sample <- removePunctuation(news_sample)
news_sample <- stripWhitespace(news_sample)
twitter_sample <- en_twitter2[sample(1:length(en_twitter2),15000)]
twitter_sample <- removePunctuation(twitter_sample)
twitter_sample <- stripWhitespace(twitter_sample)
all_sample <- c(blogs_sample,news_sample,twitter_sample)
samplecorpus <- VCorpus(VectorSource(all_sample))
termdoc <- TermDocumentMatrix(samplecorpus)
#Find all words that appear at least 1000 times in our sample:
frequent <- findFreqTerms(termdoc, 1000)
print(frequent)
## [1] "also" "back" "can" "day" "even" "first" "get"
## [8] "going" "good" "got" "great" "home" "just" "know"
## [15] "last" "life" "like" "little" "love" "made" "make"
## [22] "many" "may" "much" "need" "new" "now" "one"
## [29] "people" "really" "right" "said" "say" "school" "see"
## [36] "still" "take" "things" "think" "time" "today" "two"
## [43] "want" "way" "well" "will" "work" "year" "years"
#Find terms associated with one of the frequent terms (frequent[34] is "school")
findAssocs(termdoc, frequent[34], 0.1)
## $school
## 3017662928 boonsboro high smithsburg swingin
## 0.33 0.33 0.33 0.33 0.33
## hagerstown bandshell elementary 330 middle
## 0.32 0.23 0.20 0.19 0.18
## students district 430 230 jazz
## 0.18 0.17 0.16 0.15 0.15
## 530 schools 630 730 charter
## 0.14 0.14 0.13 0.13 0.13
## districts elementaryage prekgrade12 teacher franklin
## 0.12 0.12 0.12 0.12 0.10
## hicks teachers
## 0.10 0.10
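As the output above shows, findFreqTerms() returns the qualifying terms in alphabetical order rather than ranked by count. A small sketch to rank them by total count uses row_sums() from the slam package (on which tm is built); showing the top ten is an arbitrary illustrative choice:

#Rank the frequent terms by their total counts across the sampled documents;
#slam::row_sums() works directly on the sparse TermDocumentMatrix
term_counts <- slam::row_sums(termdoc)
head(sort(term_counts[frequent], decreasing = TRUE), 10)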
The term-document matrix can be created again from the same sample corpus, this time tokenized into pairs of words (bigrams), to gain an understanding of commonly occurring word pairs.
#A tokenizer function that splits each line of the input into bigrams using ngrams() from the NLP package
createbigrams <- function(x) {
  unlist(
    lapply(ngrams(words(x), 2), paste, collapse = " "),
    use.names = FALSE
  )
}
bigramdoc <- TermDocumentMatrix(samplecorpus, control = list(tokenize = createbigrams))
#Find all bigrams that appear at least 100 times in our sample:
frequent2 <- findFreqTerms(bigramdoc, 100)
print(frequent2)
## [1] "can get" "even though" "feel like"
## [4] "first time" "high school" "last night"
## [7] "last week" "last year" "looking forward"
## [10] "looks like" "make sure" "new jersey"
## [13] "new york" "right now" "st louis"
## [16] "two years" "united states" "years ago"
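The same tokenizer pattern extends naturally to longer n-grams. For example, a trigram term-document matrix could be built as sketched below; the frequency threshold of 25 is an arbitrary illustrative choice.

#Sketch: the bigram tokenizer generalized to trigrams by changing the n-gram length from 2 to 3
createtrigrams <- function(x) {
  unlist(
    lapply(ngrams(words(x), 3), paste, collapse = " "),
    use.names = FALSE
  )
}
trigramdoc <- TermDocumentMatrix(samplecorpus, control = list(tokenize = createtrigrams))
findFreqTerms(trigramdoc, 25)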
The n-gram analysis above will form the basis of the prediction model to be built in the next stage of this project; below is a brief overview of how that model will be built. It should be noted that while stop words were removed here for exploratory analysis, they will likely be retained in the final model, since they are a common and valid part of everyday language.
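As a rough illustration of how the bigram counts above could drive next-word prediction, the sketch below looks up the most frequent bigrams that begin with a given word and returns their second words. The helper predict_next() is hypothetical and introduced purely for illustration; it is not the final model.

#A minimal sketch of bigram-based next-word lookup (illustrative only;
#predict_next() is a hypothetical helper, not part of the final model)
predict_next <- function(last_word, bigram_tdm, top_n = 3) {
  counts <- slam::row_sums(bigram_tdm)                     #total count of each bigram
  prefix <- paste0(last_word, " ")
  candidates <- counts[startsWith(names(counts), prefix)]  #bigrams beginning with last_word
  candidates <- sort(candidates, decreasing = TRUE)
  #return the second word of the most frequent matching bigrams
  head(substring(names(candidates), nchar(prefix) + 1), top_n)
}
predict_next("right", bigramdoc)  #"right now" appeared among the frequent bigrams above

A usable model will need to go beyond this simple lookup, for example by handling longer contexts and inputs never seen in the training data, but the basic idea of ranking candidate continuations by n-gram frequency is the same.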