# Load the three en_US corpora, one element per line of text.
# Each connection is opened in read mode when it is created, read in full
# (embedded NUL bytes are skipped), and closed again straight away.

# Twitter
twitter_conn <- file('data/en_US/en_US.twitter.txt', open='r')
twitter_data <- readLines(twitter_conn, skipNul=TRUE)
close(twitter_conn)

# Blogs
blogs_conn <- file('data/en_US/en_US.blogs.txt', open='r')
blogs_data <- readLines(blogs_conn, skipNul=TRUE)
close(blogs_conn)

# News
news_conn <- file('data/en_US/en_US.news.txt', open='r')
news_data <- readLines(news_conn, skipNul=TRUE)
close(news_conn)
# Size of each raw corpus: number of lines and length of its longest line.
# nchar() is vectorised, so it is applied to the whole character vector at
# once rather than once per element through sapply(), which would also
# attach every line to the result as a name.
length(twitter_data)
## [1] 2360148
max(nchar(twitter_data))
## [1] 140
length(blogs_data)
## [1] 899288
max(nchar(blogs_data))
## [1] 40833
length(news_data)
## [1] 1010242
max(nchar(news_data))
## [1] 11384
# Ratio of tweets containing 'love' to tweets containing 'hate'
# (case-sensitive substring matches).
sum(grepl('love', twitter_data)) / sum(grepl('hate', twitter_data))
## [1] 4.108592
# First few tweets that mention 'biostats'.
head(twitter_data[grepl('biostats', twitter_data)])
## [1] "i know how you feel.. i have biostats on tuesday and i have yet to study =/"
# Number of tweets that contain this sentence.
sum(grepl('A computer once beat me at chess, but it was no match for me at kickboxing', twitter_data))
## [1] 3
The first step I want to perform is normalizing words in the datasets:
* Transform words to lowercase
* Remove punctuation, digits and any other non-letter characters
* Split each line into words on spaces, dropping empty strings
# Normalise lines of text into lowercase words.
#
# Args:
#   paragraph: character vector, one element per line of text.
#
# Returns:
#   An unnamed list with one element per input line. Each element is a
#   character vector holding that line's words: lowercased, stripped of every
#   character that is neither in 'a'..'z' nor a space, split on spaces, with
#   empty strings dropped. A line that is NA or has no letters yields
#   character(0). Input that is not a character vector, or that has length
#   zero, yields an empty list.
normalize_paragraph <- function(paragraph) {
  # Guard up front: the old `1:length(paragraph)` loop ran over c(1, 0) for
  # empty input and failed inside strsplit(...)[[1]].
  if (!is.character(paragraph) || length(paragraph) == 0) {
    return(list())
  }

  # Rebuild one line from its individual characters, keeping letters and
  # spaces only. NA is dropped explicitly; otherwise paste() would turn a
  # missing line into the literal word "NA".
  # NOTE(review): the 'a'..'z' test is a string comparison, so it follows the
  # session's collation order; in non-C locales accented letters may fall
  # inside the range. Confirm whether that is intended.
  keep_letters_and_spaces <- function(chars) {
    keep <- !is.na(chars) & ((chars >= 'a' & chars <= 'z') | chars == ' ')
    paste(chars[keep], collapse='')
  }

  # unname() so that names on the input never leak into the result.
  lowered <- tolower(unname(paragraph))
  cleaned <- vapply(strsplit(lowered, ''), keep_letters_and_spaces, character(1))

  # Consecutive spaces produce empty tokens; discard them.
  lapply(strsplit(cleaned, ' ', fixed=TRUE), function(words) words[nchar(words) > 0])
}
# Replace each raw corpus with its normalised form: a list holding one
# character vector of words per line.
news_data <- normalize_paragraph(news_data)
blogs_data <- normalize_paragraph(blogs_data)
twitter_data <- normalize_paragraph(twitter_data)
# Concatenate the three lists into one (blogs, then news, then twitter).
combined_paragraph <- c(blogs_data, news_data, twitter_data)
# rm(blogs_data, news_data, twitter_data)
# Total number of lines across the three corpora.
length(combined_paragraph)
## [1] 4269678
# lengths() returns the per-line word count as an integer vector directly;
# unlike sapply(x, length) its result type does not change for empty input.
# Total number of words:
sum(lengths(combined_paragraph))
## [1] 99639414
# Average number of words per line:
mean(lengths(combined_paragraph))
## [1] 23.33652
library(hash)
## hash-2.2.6.1 provided by Decision Patterns
# Global token -> occurrence-count table; increase_count() reads and updates it.
dict <- hash()
# Add one occurrence of a token to the global `dict` hash.
#
# Args:
#   word: character vector. A single string is used as the key unchanged; a
#     vector of several words (an n-gram) is joined with single spaces into
#     one key.
#
# Side effect: sets dict[[key]] to 1 the first time a key is seen and
# increments it by 1 on every later call.
increase_count <- function(word) {
  if (length(word) > 1) {
    # collapse= is what merges the words into ONE string. paste(word, ' ')
    # only appended padding to each element and still returned a vector,
    # which cannot serve as a single key in the lookups below.
    word <- paste(word, collapse=' ')
  }
  if (has.key(word, dict)) {
    dict[[word]] <<- dict[[word]] + 1
  } else {
    dict[[word]] <<- 1
  }
}
# Tally every word of every normalised line, in corpus order.
for (sentence_index in seq_along(combined_paragraph)) {
  sentence <- combined_paragraph[[sentence_index]]
  for (word in sentence) {
    increase_count(word)
  }
}
# Flatten the hash into a named vector (word -> count) and order it from the
# most to the least frequent word.
one_gram_distribution <- sort(unlist(as.list(dict)), decreasing=TRUE)
head(one_gram_distribution)
## the to and a of in
## 4761104 2753596 2404075 2381066 2005576 1644687
library(ggplot2)
# Bar chart of the most frequent words.
# head() stops at the end of the vector, whereas indexing with 1:20 pads the
# result with NA entries when fewer than 20 distinct words exist.
first_20_words <- head(one_gram_distribution, 20)
# The factor levels are given in frequency order so that the x axis follows
# the ranking instead of alphabetical order.
first_20_words <- data.frame(
  word=factor(names(first_20_words), levels=names(first_20_words)),
  occurrences=unname(first_20_words)
)
ggplot(first_20_words, aes(x=word, y=occurrences, fill=word)) +
  geom_bar(stat="identity")