This report explores the major features of the data in three text files, drawn from US blogs, news and Twitter feeds, respectively, as a first step towards building a text prediction algorithm.
setwd("~/Documents/Coursera Data Science/Capstone/final/texts")
blogs <- readLines("en_US.blogs.txt")
news <- readLines("en_US.news.txt")
twitter <- readLines("en_US.twitter.txt")
First, let’s take a look at the number of lines and words in each of the three files.
# Count number of lines
l1 <- length(blogs)
l2 <- length(news)
l3 <- length(twitter)
# Count number of words
library(stringi)
w1 <- stri_count(blogs,regex="\\S+")
ww1 <- sum(w1)
w2 <- stri_count(news,regex="\\S+")
ww2 <- sum(w2)
w3 <- stri_count(twitter,regex="\\S+")
ww3 <- sum(w3)
# Make a prop table of the total words and total lines
type <- c("blogs", "news", "twitter")
lines <- c(l1, l2, l3)
words <- c(ww1, ww2, ww3)
df <- data.frame(type, lines, words)
df$lines_prop <- paste0(round(df$lines/sum(df$lines)*100, 2), "%")
df$words_prop <- paste0(round(df$words/sum(df$words)*100, 2), "%")
df
## type lines words lines_prop words_prop
## 1 blogs 899288 37334131 21.06% 36.57%
## 2 news 1010242 34372530 23.66% 33.67%
## 3 twitter 2360148 30373543 55.28% 29.75%
Blogs, news, and Twitter each account for roughly a third of the total words, but Twitter alone makes up more than half of the total lines.
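To make that contrast concrete, we can compute the average words per line for each source from the table above (a quick sketch; the words_per_line column is added only for illustration). The values should line up with the per-line means in the summary statistics below.
# Average words per line, by source
df$words_per_line <- round(df$words / df$lines, 1)
df[, c("type", "words_per_line")]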
Next, let’s get summary statistics and distributions of the number of words per line for the three files.
options(scipen=999)
library(ggplot2)
# Plot number of words per line for blogs
ggplot() +
aes(w1)+
geom_histogram(binwidth=1, colour="black", fill="white") +
ggtitle("Blogs Words Per Line") +
xlab("# Words per Line") +
ylab("Freq")
# Summary stats for blogs
summary(w1)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 9.00 28.00 41.52 59.00 6630.00
# Plot number of words per line for news
ggplot() +
aes(w2)+
geom_histogram(binwidth=1, colour="black", fill="white") +
ggtitle("News Words Per Line") +
xlab("# Words per Line") +
ylab("Freq")
# Summary stats for news
summary(w2)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 19.00 31.00 34.02 45.00 1792.00
# Plot number of words per line for Twitter
ggplot() +
aes(w3)+
geom_histogram(binwidth=1, colour="black", fill="white") +
ggtitle("Twitter Words Per Line") +
xlab("# Words per Line") +
ylab("Freq")
# Summary stats for Twitter
summary(w3)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 7.00 12.00 12.87 18.00 47.00
All three files exhibit a long-tailed distribution of words per line, although Twitter has far fewer extreme outliers than news or blogs (its maximum is 47 words per line, versus over 1,000 for the others). This means that when we sample lines of the data to build our model, keeping the lines in a given proportion will not necessarily keep the words in the same proportion.
Despite this caveat, let’s go ahead and sample the files to create our dataset: 100,000 lines from blogs, 100,000 from news, and 200,000 from Twitter, so that blogs and news each make up a quarter of the sampled lines and Twitter the remaining half.
# Create samples
set.seed(123)
bl <- sample(blogs, 100000)
set.seed(345)
ne <- sample(news, 100000)
set.seed(567)
tw <- sample(twitter, 200000)
# Bind them into one dataset
data <- c(bl, ne, tw)
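As a quick check on the caveat above, we can see how the word shares shift once we sample by lines. This is a sketch reusing stri_count from earlier, not part of the required pipeline.
# Word shares in the sampled data, for comparison with the full-corpus table
sample_words <- c(blogs   = sum(stri_count(bl, regex = "\\S+")),
                  news    = sum(stri_count(ne, regex = "\\S+")),
                  twitter = sum(stri_count(tw, regex = "\\S+")))
round(prop.table(sample_words) * 100, 2)
Because Twitter lines are much shorter on average, its share of the sampled words will come out well below its 50% share of the sampled lines.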
Let’s process our data to make it easier to work with. In many NLP applications, stopwords such as “the”, “a”, and “from” are removed because they carry little analytic value on their own. However, since we are building a model that must predict every word, including stopwords, we will keep them in the dataset.
# Make all lower case
data <- tolower(data)
# Replace every character that is not a letter, whitespace, or an apostrophe with a space
library(stringr)
data <- str_replace_all(data, "[^[:alpha:][:space:]’']", " ")
# Replace one or more spaces with one space
data <- str_replace_all(data, "\\s+", " ")
# Trim leading and trailing whitespace
data <- str_trim(data)
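To see what these steps do end to end, here is a toy run on a made-up line (the sentence is hypothetical and used only for illustration):
# Illustrative run of the cleaning steps on a single made-up line
example <- "Don't miss our 50% OFF sale -- visit http://example.com today!!"
example <- tolower(example)
example <- str_replace_all(example, "[^[:alpha:][:space:]’']", " ")
example <- str_replace_all(example, "\\s+", " ")
str_trim(example)
Only lowercase letters, apostrophes, and single spaces survive the cleaning.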
Let’s make sure our model doesn’t end up suggesting bad words by taking them all out.
# Load profanity data set
# The dataset was found at http://fffff.at/googles-official-list-of-bad-words/
setwd("~/Documents/Coursera Data Science/Capstone/final/texts")
profanity <- readLines("profanity_list.txt")
# Remove them
library(tm)
clean_data <- removeWords(data, profanity)
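One thing to note: removeWords replaces each matched word with an empty string, which can leave stray double spaces behind. Re-running the whitespace collapse from above is a cheap way to tidy that up (optional, shown here as a sketch):
# Collapse any extra spaces left where profane words were removed
clean_data <- str_replace_all(clean_data, "\\s+", " ")
clean_data <- str_trim(clean_data)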
Tokenizing the dataset means breaking each line into “chains” of a certain number of consecutive words, known as n-grams. We’ll do this for single words (unigrams) all the way up to chains of four words (quadrigrams).
library("tokenizers")
grams1 <- unlist (tokenize_ngrams(clean_data, n = 1, n_min = 1))
grams2 <- unlist (tokenize_ngrams(clean_data, n = 2, n_min = 2))
grams3 <- unlist (tokenize_ngrams(clean_data, n = 3, n_min = 3))
grams4 <- unlist (tokenize_ngrams(clean_data, n = 4, n_min = 4))
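As a quick sanity check on what tokenize_ngrams returns, here is a toy call on a made-up string (the input is hypothetical):
# Bigrams from a single made-up line
tokenize_ngrams("thanks for the follow", n = 2, n_min = 2)
# yields a list with one character vector: "thanks for", "for the", "the follow"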
Now let’s plot the 20 most frequent n-grams for each set (n = 1 to 4).
library(ggplot2)
df1 <- as.data.frame(table(grams1))
df_sort1 <- df1[order(-df1$Freq),]
top_20_1 <- head(df_sort1, 20)
p1 <- ggplot(data=top_20_1, aes(x=grams1, y=Freq)) +
geom_bar(stat="identity") +
xlab("Words") +
ggtitle("Top 20 Words") +
theme(axis.text.x = element_text(angle = 45, size=8, hjust = 1))
p1
df2 <- as.data.frame(table(grams2))
df_sort2 <- df2[order(-df2$Freq),]
top_20_2 <- head(df_sort2, 20)
p2 <- ggplot(data=top_20_2, aes(x=grams2, y=Freq)) +
geom_bar(stat="identity") +
xlab("Bigrams") +
ggtitle("Top 20 Bigrams") +
theme(axis.text.x = element_text(angle = 45, size=8, hjust = 1))
p2
df3 <- as.data.frame(table(grams3))
df_sort3 <- df3[order(-df3$Freq),]
top_20_3 <- head(df_sort3, 20)
p3 <- ggplot(data=top_20_3, aes(x=grams3, y=Freq)) +
geom_bar(stat="identity") +
xlab("Trigrams") +
ggtitle("Top 20 Trigrams") +
theme(axis.text.x = element_text(angle = 45, size=8, hjust = 1))
p3
df4 <- as.data.frame(table(grams4))
df_sort4 <- df4[order(-df4$Freq),]
top_20_4 <- head(df_sort4, 20)
p4 <- ggplot(data=top_20_4, aes(x=grams4, y=Freq)) +
geom_bar(stat="identity") +
xlab("Quadrigrams") +
ggtitle("Top 20 Quadrigrams") +
theme(axis.text.x = element_text(angle = 45, size=8, hjust = 1))
p4
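Since the four plotting blocks above repeat the same steps, one possible refactor is a small helper function. This is only a sketch (plot_top_20 is a hypothetical name, not part of any package); the repeated blocks above work just as well.
# Tabulate an n-gram vector and plot its 20 most frequent entries
plot_top_20 <- function(grams, label) {
  freq <- as.data.frame(table(grams))
  top_20 <- head(freq[order(-freq$Freq), ], 20)
  ggplot(data = top_20, aes(x = grams, y = Freq)) +
    geom_bar(stat = "identity") +
    xlab(label) +
    ggtitle(paste("Top 20", label)) +
    theme(axis.text.x = element_text(angle = 45, size = 8, hjust = 1))
}
# For example, p3 above could be reproduced with:
plot_top_20(grams3, "Trigrams")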
Because these datasets are so large, it’s important to explore how we might reduce the dataset without sacrificing accuracy (or at least, not much of it). For instance, how many words, ordered by frequency, would it take to cover 90% of the text?
# Create prop table for individual words and total text
df_sort1 <- df1[order(-df1$Freq),]
df_sort1$Text_Prop <- prop.table( df_sort1$Freq )
df_sort1$Text_CumProp <- round(cumsum( df_sort1$Text_Prop ),2)
df_sort1$Word_Count <- 1
df_sort1$Cum_Word_Count <- cumsum(df_sort1$Word_Count)
df_sort1$Word_Prop <- prop.table( df_sort1$Word_Count)
df_sort1$Word_CumProp <- round(cumsum( df_sort1$Word_Prop),2)
# Clean
df_sort1 <- df_sort1[,c(1,2,6,4,8)]
# Print results
head(df_sort1[df_sort1$Text_CumProp==0.9, ],1)
## grams1 Freq Cum_Word_Count Text_CumProp Word_CumProp
## 166486 whipped 110 6872 0.9 0.04
Not that many: roughly 6,900 words, only about 4% of the unique words in the sample, cover 90% of the text.
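The same cumulative table can answer this question for any coverage target. Here is a sketch (words_for_coverage is a hypothetical helper, relying on the df_sort1 built above):
# Number of distinct words needed to reach a given share of the text
words_for_coverage <- function(p) {
  df_sort1$Cum_Word_Count[which(df_sort1$Text_CumProp >= p)[1]]
}
words_for_coverage(0.5)   # words needed to cover half the text
words_for_coverage(0.9)   # should agree with the 90% figure above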
Let’s plot the relationship between word and text coverage to better understand what words we might be able to remove from the dataset.
ggplot(data=df_sort1, aes(x=Word_CumProp, y=Text_CumProp)) +
geom_line() +
xlab("Text Coverage") +
ylab("Word Coverage") +
ggtitle("Relationship Between Word and Text Coverage 1")
ggplot(data=df_sort1, aes(x=Text_CumProp, y=Cum_Word_Count)) +
geom_line() +
xlab("Text Coverage") +
ylab("Number of Words") +
ggtitle("Relationship Between Word and Text Coverage 2")
These plots reinforce the point that the gains in total text coverage beyond the 10,000 most frequent words are limited.
Based on what we’ve learned, there are several avenues to explore for building an accurate, fast model from our cleaned dataset.
Onwards!