Pandatas, December 2019
The Data Science Specialization Capstone Project is a data science project to develop a predictive text model from multiple text datasets. This milestone report describes how the provided datasets were loaded, cleaned, and sampled. It also contains some basic exploratory analysis of the data and a short summary of the plans for the predictive model and Shiny App.
In this section, the R libraries and the provided data are loaded.
Loading the libraries:
# Loading libraries
library(tm)
## Loading required package: NLP
library(ngram)
library(RColorBrewer)
library(readtext)
library(pryr)
## Registered S3 method overwritten by 'pryr':
## method from
## print.bytes Rcpp
##
## Attaching package: 'pryr'
## The following object is masked from 'package:tm':
##
## inspect
library(rJava)
library(RWeka)
library(wordcloud)
library(quanteda)
## Registered S3 methods overwritten by 'ggplot2':
## method from
## [.quosures rlang
## c.quosures rlang
## print.quosures rlang
## Package version: 1.5.1
## Parallel computing: 2 of 4 threads used.
## See https://quanteda.io for tutorials and examples.
##
## Attaching package: 'quanteda'
## The following objects are masked from 'package:tm':
##
## as.DocumentTermMatrix, stopwords
## The following object is masked from 'package:utils':
##
## View
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
Downloading and unzipping the data:
# Downloading the provided data
fileURL <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
download.file(fileURL, "final.zip")
# Unzip the data
unzip("final.zip")
The data were read into R and some basic statistics were run on each file:
#Get file sizes
size_blogs <- file.info("./final/en_US/en_US.blogs.txt")$size /1024^2
size_news <- file.info("./final/en_US/en_US.news.txt")$size /1024^2
size_twitter <- file.info("./final/en_US/en_US.twitter.txt")$size/1024^2
# Word and line count blogs file
blogs<-file("./final/en_US/en_US.blogs.txt","r")
blogs_lines<-readLines(blogs)
close(blogs)
wordcount(blogs_lines)
## [1] 37334131
length(blogs_lines)
## [1] 899288
# Word and line count news file
news<-file("./final/en_US/en_US.news.txt","r")
news_lines<-readLines(news)
close(news)
wordcount(news_lines)
## [1] 34372530
length(news_lines)
## [1] 1010242
# Word and line count twitter file
twitter<-file("./final/en_US/en_US.twitter.txt","r")
twitter_lines<-readLines(twitter)
close(twitter)
wordcount(twitter_lines)
## [1] 30373543
length(twitter_lines)
## [1] 2360148
# Summary of the data sets
data.frame(Dataset = c("Blogs", "News", "Twitter"),
filesize.in.MB = c(size_blogs, size_news, size_twitter),
Linecount = c(length(blogs_lines), length(news_lines), length(twitter_lines)),
wordscount = c(wordcount(blogs_lines), wordcount(news_lines), wordcount(twitter_lines))
)
## Dataset filesize.in.MB Linecount wordscount
## 1 Blogs 200.4242 899288 37334131
## 2 News 196.2775 1010242 34372530
## 3 Twitter 159.3641 2360148 30373543
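Each file is also large once read into memory. As an optional check (a sketch, not part of the original analysis), the pryr package loaded above can report the in-memory size of the character vectors, which helps motivate the 1% subsample taken in the next step:
# Optional check with the already-loaded pryr package: in-memory size of the raw lines
object_size(blogs_lines)
object_size(news_lines)
object_size(twitter_lines)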
Then the dataset was sampled and cleaned by removing the following:
* White space
* Punctuation
* Upper case letters (converted to lower case)
* Numbers
* Stopwords
# Subsample and prepare corpus
set.seed(12345) # Ensuring a reproducible sample set
samplesize <- 0.01 # Subsample to 1%
corpussample <- c(sample(blogs_lines, length(blogs_lines) * samplesize),
sample(news_lines, length(news_lines) * samplesize),
sample(twitter_lines, length(twitter_lines) * samplesize))
corpussample <- VCorpus(VectorSource(corpussample))
corpussample <- tm_map(corpussample, stripWhitespace) # Remove unnecessary white spaces
corpussample <- tm_map(corpussample, removePunctuation) # Remove punctuation
corpussample <- tm_map(corpussample, content_transformer(tolower)) # Convert to lowercase
corpussample <- tm_map(corpussample, removeNumbers) # Remove numbers
corpus <- tm_map(corpussample, removeWords, stopwords("english")) # Remove stopwords
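To verify that the transformations worked as intended (an illustrative check, not part of the original script), the first cleaned document can be inspected; it should be lower case with punctuation, numbers, and English stopwords removed. content() is used here because pryr masks tm's inspect(), as noted in the loading messages above.
# Illustrative check of the cleaning steps on the first document
content(corpus[[1]])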
To explore the data, a wordcloud was produced to get an impression of frequently used words in the corpus:
wordcloud(corpus, scale=c(3,0.5), min.freq=5, max.words=100, random.order=TRUE, rot.per=0.5, colors=brewer.pal(5, "Set1"), use.r.layout=FALSE)
The corpus was analyzed to determine the frequencies of unigrams, bigrams, and trigrams:
corpus.dataframe <- data.frame(text = unlist(sapply(corpus, '[', 'content')), stringsAsFactors = F)
# Unigram
unigramtoken <- data.frame(table(NGramTokenizer(corpus.dataframe, Weka_control(min = 1, max = 1))))
unigram <- unigramtoken[order(unigramtoken$Freq, decreasing = TRUE),]
par(mfrow = c(1, 1))
par(mar=c(5,4,2,0))
barplot(unigram[1:20,2],
names.arg=unigram[1:20,1],
col = "blue",
main="Top 20 of unigrams",
las=2,
ylab = "Frequency")
# Bigram
bigramtoken <- data.frame(table(NGramTokenizer(corpus.dataframe, Weka_control(min = 2, max = 2))))
bigram <- bigramtoken[order(bigramtoken$Freq, decreasing = TRUE),]
par(mfrow = c(1, 1))
par(mar=c(5,4,2,1))
barplot(bigram[1:20,2],
names.arg=bigram[1:20,1],
col = "blue",
main="Top 20 of bigrams",
las=2,
ylab = "Frequency")
# Trigram
trigramtoken <- data.frame(table(NGramTokenizer(corpus.dataframe, Weka_control(min = 3, max = 3))))
trigram <- trigramtoken[order(trigramtoken$Freq, decreasing = TRUE),]
par(mfrow = c(1, 1))
par(mar=c(9,4,2,1))
barplot(trigram[1:20,2],
names.arg=trigram[1:20,1],
col = "blue",
main="Top 20 of trigrams",
las=2,
ylab = "Frequency")
The following items will need to be addressed for the development of the prediction algorithm and Shiny App:
* further optimize data cleaning
* develop training and test sets from the data
* look at the trade-off between size and runtime of the prediction algorithm
* develop the prediction algorithm (a rough sketch of a possible approach follows this list)
* develop the Shiny App
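To make the prediction step concrete, below is a minimal sketch of a simple frequency-based back-off lookup. It assumes the unigram, bigram, and trigram tables built above (with their Var1/Freq columns) and an already-cleaned input phrase; the function name predict_next is hypothetical, and the final algorithm will still need to address smoothing, input cleaning, and the size/runtime trade-off mentioned above.
# A rough back-off sketch (not the final algorithm): look for a trigram starting
# with the last two words of the input, back off to bigrams, then fall back to
# the most frequent unigram. Assumes the trigram/bigram/unigram tables built above.
predict_next <- function(phrase) {
  words <- tail(strsplit(tolower(phrase), "\\s+")[[1]], 2)
  # Trigrams beginning with the last two input words
  if (length(words) == 2) {
    hits <- trigram[grep(paste0("^", words[1], " ", words[2], " "), trigram$Var1), ]
    if (nrow(hits) > 0) return(sub(".* ", "", as.character(hits$Var1[1])))
  }
  # Back off: bigrams beginning with the last input word
  hits <- bigram[grep(paste0("^", tail(words, 1), " "), bigram$Var1), ]
  if (nrow(hits) > 0) return(sub(".* ", "", as.character(hits$Var1[1])))
  # Final fallback: the single most frequent unigram
  as.character(unigram$Var1[1])
}
predict_next("thanks for")
Because the tables are already sorted by decreasing frequency, the first matching row is the most likely continuation; in the final model these data frames would likely be replaced by indexed lookup tables to keep the app responsive.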