This report is part of the Coursera Data Science Specialization Capstone project for Week 2 of the course. The goals of this project are to:
1. Demonstrate that I've downloaded the data and successfully loaded it.
2. Create a basic report of summary statistics about the data sets.
setwd("U:/EA/Capstone")
# install.packages("RWeka") if not already loaded
# install.packages("textcat") if not already loaded
library(tm)
## Loading required package: NLP
library(RWeka)
library(textcat)
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
# the files have already been downloaded
twitter<-readLines('en_US.twitter.txt')
blogs<-readLines('en_US.blogs.txt')
news<-readLines('en_US.news.txt')
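The raw files may contain embedded nul characters and non-UTF-8 bytes, which can trigger warnings from readLines. An optional variant of the read is shown commented out below (it was not used here, so the counts reported later come from the plain readLines calls above).
# Optional alternative read: treat the text as UTF-8 and skip embedded nuls
# twitter <- readLines('en_US.twitter.txt', encoding = "UTF-8", skipNul = TRUE)
# blogs <- readLines('en_US.blogs.txt', encoding = "UTF-8", skipNul = TRUE)
# news <- readLines('en_US.news.txt', encoding = "UTF-8", skipNul = TRUE)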
The following summary gives a general idea of the size of the three files.
# This answers the Review Criteria, "Has the data scientist done basic summaries of the three files? Word counts, line counts and basic data tables?"
# install.packages("stringi") if not already installed
library(stringi)
# Determine file sizes
blogssize <- file.info("U:/EA/Capstone/en_US.blogs.txt")$size / 1024^2
newssize <- file.info("U:/EA/Capstone/en_US.news.txt")$size / 1024^2
twittersize <- file.info("U:/EA/Capstone/en_US.twitter.txt")$size / 1024^2
# Determine number of words in files
blogswords <- stri_count_words(blogs)
newswords <- stri_count_words(news)
twitterwords <- stri_count_words(twitter)
# Summary of the data sets
data.frame(source = c("blogs", "news", "twitter"),
           file.size.MB = c(blogssize, newssize, twittersize),
           num.lines = c(length(blogs), length(news), length(twitter)),
           num.words = c(sum(blogswords), sum(newswords), sum(twitterwords)),
           mean.num.words = c(mean(blogswords), mean(newswords), mean(twitterwords)))
## source file.size.MB num.lines num.words mean.num.words
## 1 blogs 200.4242 899288 38154238 42.42716
## 2 news 196.2775 77259 2693898 34.86840
## 3 twitter 159.3641 2360148 30218125 12.80349
Take a sample of the data (1% of each file) and remove URLs, Twitter handles, numbers, punctuation, stop words, and extra whitespace.
library(tm) # text mining package (already attached above)
# Sample the data
set.seed(800)
data.sample <- c(sample(blogs, length(blogs) * 0.01),
                 sample(news, length(news) * 0.01),
                 sample(twitter, length(twitter) * 0.01))
# Create corpus and clean the data
corpus <- VCorpus(VectorSource(data.sample))
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
corpus <- tm_map(corpus, toSpace, "(f|ht)tp(s?)://(.*)[.][a-z]+") # removes URLs
corpus <- tm_map(corpus, toSpace, "@[^\\s]+") # removes special characters
corpus <- tm_map(corpus, tolower) # changes to lower case
corpus <- tm_map(corpus, removeWords, stopwords("en")) # filters most common words
corpus <- tm_map(corpus, removePunctuation) # removes punctuation
corpus <- tm_map(corpus, removeNumbers) # removes numbers
corpus <- tm_map(corpus, stripWhitespace) # compresses whitespace character to a single blank
corpus <- tm_map(corpus, PlainTextDocument) # creates plain text documents
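As a quick sanity check (output not shown), the first couple of cleaned documents can be printed to confirm the transformations behaved as expected:
writeLines(as.character(corpus[[1]])) # content of the first cleaned document
writeLines(as.character(corpus[[2]])) # content of the second cleaned document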
Create functions to compute word and word-combination (n-gram) frequencies and to plot the most common terms.
library(ggplot2)
options(mc.cores = 1) # force single-core processing; the Java-based RWeka tokenizers can hang when tm runs in parallel
getFreq <- function(tdm) {
  freq <- sort(rowSums(as.matrix(tdm)), decreasing = TRUE)
  return(data.frame(word = names(freq), freq = freq))
}
bigram <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
trigram <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
makePlot <- function(data, label) {
  ggplot(data[1:30, ], aes(reorder(word, -freq), freq)) +
    labs(x = label, y = "Frequency") +
    theme(axis.text.x = element_text(angle = 60, size = 12, hjust = 1)) +
    geom_bar(stat = "identity", fill = I("blue"))
}
# Get frequencies of the most common n-grams in the data sample.
# The sparsity thresholds keep only terms appearing in at least roughly 1%,
# 0.1%, and 0.01% of the sampled documents, respectively.
freq1 <- getFreq(removeSparseTerms(TermDocumentMatrix(corpus), 0.99))
freq2 <- getFreq(removeSparseTerms(TermDocumentMatrix(corpus, control = list(tokenize = bigram)), 0.999))
freq3 <- getFreq(removeSparseTerms(TermDocumentMatrix(corpus, control = list(tokenize = trigram)), 0.9999))
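Each frequency table is a data frame sorted by descending frequency, so the most common terms can be checked directly before plotting (output omitted here):
head(freq1, 10) # ten most frequent single words
head(freq2, 10) # ten most frequent two-word combinations
head(freq3, 10) # ten most frequent three-word combinations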
Histogram of the 30 Most Common Words
makePlot(freq1, "30 Most Common Words")
Histogram of the 30 Most Common Two-Word Combinations
makePlot(freq2, "Most Common Two-Word Combinations")
Histogram of the 30 Most Common Three-Word Combinations
makePlot(freq3, "Most Common Three-Word Combinations")
Reading the data sets was time-consuming because of their size. RWeka was difficult to load but necessary for extracting the n-gram frequency counts. The goal of this report was simply to demonstrate that I have become comfortable working with the data and am on track to build the prediction algorithm.
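As a rough illustration of where this is heading (only a sketch of one possible approach, not the final algorithm; the helper name predictNext is my own), the bigram table computed above could already drive a naive next-word lookup:
# Minimal sketch of a next-word lookup built on the bigram frequencies.
# Illustrative only; the real model will need smoothing and backoff to trigrams.
predictNext <- function(prev, bigramFreq = freq2) {
  prev <- tolower(prev)
  # keep bigrams whose first word matches the supplied word
  hits <- bigramFreq[startsWith(as.character(bigramFreq$word), paste0(prev, " ")), ]
  if (nrow(hits) == 0) return(NA_character_)
  # bigramFreq is already sorted by frequency, so the first hit is the best guess;
  # return its second word
  sub("^\\S+\\s+", "", as.character(hits$word[1]))
}
predictNext("right") # actual result depends on the sample
A fuller version would back off from the trigram table to the bigram table, and finally to overall word frequencies, when a longer history finds no match.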