In this report, we:
1. Demonstrate the loading of English language datasets into R
2. Examine the data and report summary statistics
3. Perform exploratory data analysis
4. Discuss future plans for transforming these preliminary findings into a prediction algorithm and app
Sometimes, the job of a data scientist is to walk the line between speed and accuracy. To develop a predictive text model, we erred on the side of speed (especially at this early stage): we loaded the full twitter, blogs, and news English datasets, but reserved the heavier processing steps for a small sample, as described below.
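Each file is read in full with readLines; the twitter file, for example, is read like this (the same code appears in the appendix):
conn <- file("en_US.twitter.txt", open = "r")
twitter <- readLines(con = conn, -1L, skipNul = TRUE, warn = FALSE)
close(conn)
# the blogs and news files are read the same way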
The sizes of the files (in MB) are:
twitter_size <- file.info("en_US.twitter.txt")$size/1000000
blogs_size <- file.info("en_US.blogs.txt")$size/1000000
news_size <- file.info("en_US.news.txt")$size/1000000
We then use the stringi library to obtain word and line counts and organize the results into a table.
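The counts are computed as follows (this code also appears in the appendix):
twitter_words <- stri_count_words(twitter)
blog_words <- stri_count_words(blogs)
news_words <- stri_count_words(news)
summ_table <- data.frame(data_source = c("twitter", "blogs", "news"),
                         file_size_mb = c(twitter_size, blogs_size, news_size),
                         num_lines = c(length(twitter), length(blogs), length(news)),
                         num_words = c(sum(twitter_words), sum(blog_words), sum(news_words)),
                         mean_words = c(mean(twitter_words), mean(blog_words), mean(news_words)))
summ_table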
## data_source file_size_mb num_lines num_words mean_words
## 1 twitter NA 2360148 30218166 12.80350
## 2 blogs NA 899288 38154238 42.42716
## 3 news NA 766277 26565301 34.66801
As we might expect, twitter averages fewer words per line than either blogs or news because of Twitter's 140-character limit. The twitter data also contain far more lines in total, which makes sense given how short each message is.
In the interest of saving time, we performed the processing steps on a 1% subset of the data.
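The sample is drawn with a fixed seed so the subset is reproducible (code repeated from the appendix):
set.seed(1333)
twitter_samp <- sample(twitter, length(twitter) * 0.01)
blogs_samp <- sample(blogs, length(blogs) * 0.01)
news_samp <- sample(news, length(news) * 0.01)
all_samp <- c(twitter_samp, blogs_samp, news_samp)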
We removed special characters and expanded some contractions before creating a corpus. Since we’re making predictions on which word usually comes next, we left the stopwords in.
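For example (the full set of substitutions is in the appendix):
# remove non-ASCII characters
all_samp <- iconv(all_samp, "latin1", "ASCII", sub = "")
# expand contractions so their components become separate words
all_samp <- gsub("can't", "can not", all_samp, perl = TRUE)
all_samp <- gsub("'re", " are", all_samp, perl = TRUE)
all_samp <- gsub("n't", " not", all_samp, perl = TRUE)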
Next we create a corpus and clean it, removing numbers, punctuation, profanity, and extra whitespace. Since we want every instance of each word to be treated the same, we also convert everything to lowercase.
# Create a corpus
corpus <- Corpus(VectorSource(all_samp))
# preprocess the data: remove punctuation and numbers, convert to all lowercase
corpus <- tm_map(corpus, PlainTextDocument)
corpus <- tm_map(corpus,content_transformer(tolower))
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, stripWhitespace)
corpus <- tm_map(corpus, removePunctuation)
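Profanity is then filtered out with removeWords (word list as in the appendix):
profanewords <- c("shit", "fuck", "ass", "cunt", "dick", "damn", "hell", "motherfucker")
corpus <- tm_map(corpus, removeWords, profanewords)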
Now that we have a clean(er) dataset, we can perform preliminary n-gram/tokenization analysis.
We determine the most commonly used words with term frequency analysis, and plot the most common uni-, bi-, and tri-grams.
tf <- termFreq(PlainTextDocument(corpus))
qplot(tf, geom="histogram", binwidth = 1) + xlim(0,45) + xlab("Words") + ylab("Frequency")
## Warning: Removed 1987 rows containing non-finite values (stat_bin).
print(tail(sort(tf),n=10))
## txt
## mon origin description min heading
## 40258 40260 40275 40279 40305
## language the hour year character(0),
## 40312 40342 40419 41113 201275
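The frequency tables below are built by tokenizing the corpus into n-grams with RWeka's NGramTokenizer and summing counts with slam's rollup; the bigram step, for example, looks like this (the full uni-, bi-, and tri-gram code is in the appendix):
bigram <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
tdm2g <- TermDocumentMatrix(corpus, control = list(tokenize = bigram))
tdm2g.good <- rollup(tdm2g, 2, na.rm = TRUE, FUN = sum)
bigram.tf <- findFreqTerms(tdm2g.good, lowfreq = 10)
bigram.tf <- sort(rowSums(as.matrix(tdm2g.good[bigram.tf, ])), decreasing = TRUE)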
## word frequency
## the the 42351
## and and 21701
## for for 10164
## you you 9914
## that that 9517
## not not 7649
## with with 6587
## have have 5814
## was was 5749
## are are 5590
## phrase frequency
## of the of the 3788
## in the in the 3597
## i am i am 2567
## to the to the 1899
## for the for the 1855
## on the on the 1760
## to be to be 1524
## do not do not 1407
## i have i have 1378
## at the at the 1260
## phrase frequency
## i do not i do not 465
## one of the one of the 303
## a lot of a lot of 279
## thanks for the thanks for the 253
## i am not i am not 207
## to be a to be a 186
## going to be going to be 185
## i have been i have been 177
## i can not i can not 171
## i did not i did not 165
We will develop a text prediction model using the bigram and trigram data obtained from this 1% sample of the corpus. The n-gram counts can be used to estimate the probability of each candidate next word given the preceding words. The prediction model will be turned into a Shiny app once it proves sufficiently accurate.
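As a minimal sketch of the idea, assuming the unigram.tf and bigram.tf data frames built above, the maximum-likelihood estimate P(w2 | w1) = count(w1 w2) / count(w1) can be read straight off the frequency tables; next_word_prob below is a hypothetical helper for illustration, not part of the final model:
# sketch: estimate P(next | previous) from the bigram and unigram counts
next_word_prob <- function(previous, nxt, unigram.tf, bigram.tf) {
  bi_count  <- bigram.tf$frequency[bigram.tf$phrase == paste(previous, nxt)]
  uni_count <- unigram.tf$frequency[unigram.tf$word == previous]
  if (length(bi_count) == 0 || length(uni_count) == 0) return(0)
  bi_count / uni_count
}
# e.g. next_word_prob("of", "the", unigram.tf, bigram.tf)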
# load required libraries
suppressMessages(suppressWarnings(library(tm)))
suppressMessages(suppressWarnings(library(RWeka)))
suppressMessages(suppressWarnings(library(ggplot2)))
suppressMessages(suppressWarnings(library(stringi)))
suppressMessages(suppressWarnings(library(slam)))
suppressMessages(suppressWarnings(library(knitr)))
setwd("C:/Users/Rachel/Documents/R Programming/capstone/en_US")
#read in twitter, blogs, and news datasets
conn <- file("en_US.twitter.txt", open="r")
twitter <- readLines(con = conn, -1L, skipNul = TRUE, warn = FALSE)
close(conn)
conn <- file("en_US.blogs.txt", open="r")
blogs <- readLines(con = conn, -1L, skipNul = TRUE, warn = FALSE)
close(conn)
conn <- file("en_US.news.txt", open="r")
news <- readLines(con = conn, -1L, skipNul = TRUE, warn = FALSE)
close(conn)
twitter_size <- file.info("en_US.twitter.txt")$size/1000000
blogs_size <- file.info("en_US.blogs.txt")$size/1000000
news_size <- file.info("en_US.news.txt")$size/1000000
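# count words per line in each dataset with stringi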
twitter_words <- stri_count_words(twitter)
blog_words <- stri_count_words(blogs)
news_words <- stri_count_words(news)
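# assemble a summary table of file sizes, line counts, and word counts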
summ_table <- data.frame(data_source = c("twitter","blogs","news"),
file_size_mb = c(twitter_size, blogs_size, news_size),
num_lines = c(length(twitter),length(blogs),length(news)),
num_words = c(sum(twitter_words),sum(blog_words),sum(news_words)),
mean_words = c(mean(twitter_words),mean(blog_words),mean(news_words)))
summ_table
# Take 1% sample
set.seed(1333)
twitter_samp <- sample(twitter, length(twitter) * 0.01)
blogs_samp <- sample(blogs, length(blogs) * 0.01)
news_samp <- sample(news, length(news) * 0.01)
# Combine samples into a single dataset
all_samp <- c(twitter_samp, blogs_samp, news_samp)
# remove special characters
all_samp <- iconv(all_samp, "latin1", "ASCII", sub="")
# remove contractions
all_samp <- gsub("can't", "can not", all_samp, perl = TRUE)
all_samp <- gsub("'m", " am", all_samp, perl = TRUE)
all_samp <- gsub("let's", "let us", all_samp, perl = TRUE)
all_samp <- gsub("'re", " are", all_samp, perl = TRUE)
all_samp <- gsub("'ve", " have", all_samp, perl = TRUE)
all_samp <- gsub("'d", " had", all_samp, perl = TRUE)
all_samp <- gsub("'ll", " will", all_samp, perl = TRUE)
all_samp <- gsub("n't", " not", all_samp, perl = TRUE)
# Create a corpus
corpus <- Corpus(VectorSource(all_samp))
# preprocess the data: remove punctuation and numbers, convert to all lowercase
corpus <- tm_map(corpus, PlainTextDocument)
corpus <- tm_map(corpus,content_transformer(tolower))
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, stripWhitespace)
corpus <- tm_map(corpus, removePunctuation)
# remove profanities - favorite part of this project
profanewords <- c("shit", "fuck", "ass", "cunt", "dick", "damn", "hell", "motherfucker")
corpus <- tm_map(corpus, removeWords, profanewords)
# create tdm
#tdm <- TermDocumentMatrix(corpus)
# inspect a random list of ten words
#inspect(tdm[100:110,1])
tf <- termFreq(PlainTextDocument(corpus))
qplot(tf, geom="histogram", binwidth = 1) + xlim(0,45) + xlab("Words") + ylab("Frequency")
print(tail(sort(tf),n=10))
# n-gram analysis for n = 1, 2, 3
unigram <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))
tdm1g <- TermDocumentMatrix(corpus, control = list(tokenize = unigram))
tdm1g.good <- rollup(tdm1g, 2, na.rm=TRUE, FUN = sum)
unigram.tf <- findFreqTerms(tdm1g.good, lowfreq = 10)
unigram.tf <- sort(rowSums(as.matrix(tdm1g.good[unigram.tf, ])), decreasing = TRUE)
unigram.tf <- data.frame(tdm1g.good=names(unigram.tf), frequency=unigram.tf)
names(unigram.tf) <- c("word", "frequency")
head(unigram.tf, 10)
#unigram plot
g <- ggplot(data = head(unigram.tf, 10), aes(x = word, y = frequency))
g <- g + geom_bar(stat = "identity", fill = "blue", colour = "black")
g <- g + geom_text(aes(label=frequency), vjust=-0.1)
g <- g + theme(axis.text.x = element_text(angle = 90, hjust = 1))
g
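# bigram analysis: tokenize, tabulate frequencies, and plot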
bigram <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
tdm2g <- TermDocumentMatrix(corpus, control = list(tokenize = bigram))
tdm2g.good <- rollup(tdm2g, 2, na.rm=TRUE, FUN = sum)
bigram.tf <- findFreqTerms(tdm2g.good, lowfreq = 10)
bigram.tf <- sort(rowSums(as.matrix(tdm2g.good[bigram.tf, ])), decreasing = TRUE)
bigram.tf <- data.frame(tdm2g.good=names(bigram.tf), frequency=bigram.tf)
names(bigram.tf) <- c("phrase", "frequency")
head(bigram.tf, 10)
#bigram plot
h <- ggplot(data = head(bigram.tf, 10), aes(x = phrase, y = frequency))
h <- h + geom_bar(stat = "identity", fill = "red", colour = "black")
h <- h + geom_text(aes(label=frequency), vjust=-0.1)
h <- h + theme(axis.text.x = element_text(angle = 90, hjust = 1))
h
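# trigram analysis: tokenize, tabulate frequencies, and plot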
trigram <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
tdm3g <- TermDocumentMatrix(corpus, control = list(tokenize = trigram))
tdm3g.good <- rollup(tdm3g, 2, na.rm=TRUE, FUN = sum)
trigram.tf <- findFreqTerms(tdm3g.good, lowfreq = 10)
trigram.tf <- sort(rowSums(as.matrix(tdm3g.good[trigram.tf, ])), decreasing = TRUE)
trigram.tf <- data.frame(tdm3g.good=names(trigram.tf), frequency=trigram.tf)
names(trigram.tf) <- c("phrase", "frequency")
head(trigram.tf, 10)
# trigram plot
m <- ggplot(data = head(trigram.tf, 10), aes(x = phrase, y = frequency))
m <- m + geom_bar(stat = "identity", fill = "green", colour = "black")
m <- m + geom_text(aes(label=frequency), vjust=-0.1)
m <- m + theme(axis.text.x = element_text(angle = 90, hjust = 1))
m