This report summarizes an exploratory analysis of the Capstone Dataset for the Coursera Data Science Capstone project.
Download the Capstone Dataset and extract it to a folder called dataset.
echo "Downloading Coursera-SwiftKey.zip ..."
rm -f Coursera-SwiftKey.zip
curl -O https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip
echo "Unziping Coursera-SwiftKey.zip..."
rm -r dataset
mkdir dataset
tar xvf Coursera-SwiftKey.zip -C `pwd`/dataset --strip-components 1
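Should the same step need to be run from within R instead of the shell, a rough base-R equivalent is sketched below (a sketch only; note that unzip() keeps the archive's top-level folder, whereas --strip-components 1 above drops it).
zip_file <- "Coursera-SwiftKey.zip"
if (!file.exists(zip_file)) {
    download.file("https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip",
                  destfile = zip_file, mode = "wb")
}
unzip(zip_file, exdir = "dataset")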
Under dataset there are four folders, one per language. Each folder contains three text files: blogs, news, and tweets.
find dataset -maxdepth 1 -exec ls -l "{}" \; | awk '{print $9}'
##
## de_DE
## en_US
## fi_FI
## ru_RU
##
## de_DE.blogs.txt
## de_DE.news.txt
## de_DE.twitter.txt
##
## en_US.blogs.txt
## en_US.news.txt
## en_US.twitter.txt
## tokenized_en_US.blogs.txt
## tokenized_en_US.news.txt
## tokenized_en_US.twitter.txt
##
## fi_FI.blogs.txt
## fi_FI.news.txt
## fi_FI.twitter.txt
##
## ru_RU.blogs.txt
## ru_RU.news.txt
## ru_RU.twitter.txt
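The same overview can be produced from R. The sketch below (base R only; the dataset path comes from the extraction step above, and the column names are just for illustration) lists each text file together with its size in megabytes.
txt_files <- list.files("dataset", pattern = "\\.txt$", recursive = TRUE, full.names = TRUE)
data.frame(file = basename(txt_files),
           size_MB = round(file.size(txt_files) / 2^20, 1))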
Here I use the Stanford Tokenizer from the Stanford Natural Language Processing Group. The Stanford Tokenizer is not distributed separately but is included in several of their software downloads, including the Stanford Parser, Stanford Part-of-Speech Tagger, Stanford Named Entity Recognizer, and Stanford CoreNLP.
echo "Downloading Stanford parser ..."
curl -O http://nlp.stanford.edu/software/stanford-parser-full-2015-01-29.zip
rm -rf stanford
mkdir stanford
tar xvf stanford-parser-full-2015-01-29.zip -C `pwd`/stanford --strip-components 1
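Before tokenizing, a quick sanity check from R (a sketch; the jar path mirrors the CLASSPATH set in the script below) confirms that the parser jar was extracted and that java is available on the PATH.
file.exists("stanford/stanford-parser.jar")
system("java -version")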
Here I use the Stanford Tokenizer to tokenize the tweets, news, and blogs under dataset/en_US/.
twitter="en_US.twitter.txt"
blogs="en_US.blogs.txt"
news="en_US.news.txt"
function statistics(){
    echo "File Info:"
    echo " File name: $1"
    # Human-readable file size taken from ls -lah
    filesize=`ls -lah dataset/* | grep -w $1 | awk '{print $5}'`
    echo " FileSize: $filesize"
    file=`find . -name $1`
    # Line and word counts via wc
    nlines=`wc -l $file | awk '{print $1}'`
    echo " Number of lines: $nlines"
    nwords=`wc -w $file | awk '{print $1}'`
    echo " Number of words: $nwords"
}
function tokenization(){
    file=`find . -name $1`
    output="dataset/en_US/tokenized_`basename $file`"
    echo "Tokenizing $file..."
    export CLASSPATH=stanford/stanford-parser.jar
    # PTBTokenizer options: preserve input line breaks, silently drop untokenizable
    # characters, and leave parentheses/brackets unnormalized
    java edu.stanford.nlp.process.PTBTokenizer -preserveLines -options untokenizable=noneDelete,normalizeParentheses=false,normalizeOtherBrackets=false < $file > $output
    echo "Tokenized data generated successfully: tokenized_`basename $file`"
}
tokenizedfiles=()
for src in $twitter $blogs $news
do
    echo "======================================================"
    echo "Processing $src..."
    statistics $src
    tokenization $src
    statistics "tokenized_$src"
    echo "======================================================"
    tokenizedfiles+=("tokenized_$src")
done
echo "Tokenized files generated under dataset/en_US/"
echo ${tokenizedfiles[@]}
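A quick spot check in R (a sketch; the file names come from the loop above) compares the first raw line with its tokenized counterpart to confirm the tokenizer ran as expected.
readLines("dataset/en_US/en_US.twitter.txt", n = 1)
readLines("dataset/en_US/tokenized_en_US.twitter.txt", n = 1)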
To extract more valuable information from the data, I remove punctuation, numbers, and stopwords, and convert the text to lower case (stemming is available but left commented out).
library(tm)
library(SnowballC)
library(slam)
library(RWeka)
library(RColorBrewer)
library(wordcloud)
library(reshape2)
library(rCharts)
options(
rcharts.mode = 'iframesrc',
rcharts.cdn = TRUE,
RCHART_WIDTH = 600,
RCHART_HEIGHT = 400
)
library(knitr)
opts_chunk$set(tidy = F, results = 'asis', comment = NA)
options(mc.cores=1)
explore <- function(file, sample_size){
    # Read the first sample_size lines of the file
    data <- readLines(file, n = sample_size)
    # Lower-case first so that capitalized stopwords are also caught, then
    # remove stopwords, numbers and punctuation for exploratory purposes
    corpus <- VCorpus(VectorSource(data))
    corpus <- tm_map(corpus, content_transformer(tolower))
    corpus <- tm_map(corpus, removeWords, stopwords("english"))
    corpus <- tm_map(corpus, removeNumbers)
    corpus <- tm_map(corpus, removePunctuation)
    #corpus <- tm_map(corpus, stemDocument)
    # Define tokenization functions for 1-, 2- and 3-grams
    uni_tokenizer <- function(t) NGramTokenizer(t, Weka_control(min = 1, max = 1))
    bi_tokenizer  <- function(t) NGramTokenizer(t, Weka_control(min = 2, max = 2))
    tri_tokenizer <- function(t) NGramTokenizer(t, Weka_control(min = 3, max = 3))
    # Build 1-, 2- and 3-gram term-document matrices
    unigram <- TermDocumentMatrix(corpus, control = list(tokenize = uni_tokenizer))
    bigram  <- TermDocumentMatrix(corpus, control = list(tokenize = bi_tokenizer))
    trigram <- TermDocumentMatrix(corpus, control = list(tokenize = tri_tokenizer))
    # Removing sparse terms (note: the results are not assigned here, so the full
    # matrices are still the ones used for the plots below)
    removeSparseTerms(unigram, 0.6)
    removeSparseTerms(bigram, 0.6)
    removeSparseTerms(trigram, 0.6)
    # Word clouds
    wordCloud(unigram)
    wordCloud(bigram)
    wordCloud(trigram)
    # Bar plots
    barPlot(unigram, "unigram")
    barPlot(bigram, "bigram")
    barPlot(trigram, "trigram")
}
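# A tiny illustration (a sketch, not part of the pipeline above) of what the
# cleaning steps inside explore() do to a single sentence, using the same tm
# transformations in the same order:
demo <- VCorpus(VectorSource("The 2 Cats are sitting on 10 mats!"))
demo <- tm_map(demo, content_transformer(tolower))
demo <- tm_map(demo, removeWords, stopwords("english"))
demo <- tm_map(demo, removeNumbers)
demo <- tm_map(demo, removePunctuation)
content(demo[[1]])  # lower-cased, with stopwords, numbers and punctuation stripped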
wordCloud <- function(x){
    v <- sort(row_sums(x), decreasing = TRUE)
    d <- data.frame(word = names(v), freq = v)
    wordcloud(d$word, d$freq, max.words = 50, colors = brewer.pal(8, "Dark2"),
              scale = c(2, 0.2))
}
barPlot <- function(x, name){
    v <- sort(row_sums(x), decreasing = TRUE)
    d <- data.frame(Word = names(v), Frequency = v)
    d <- d[with(d, order(-Frequency)), ]
    rp1 <- rPlot(x = "Word", y = "Frequency", data = d[1:50, ], type = "bar", title = name)
    rp1$show('iframesrc', cdn = TRUE)
}
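# Optional helper (a sketch, not used by the calls below): tabulate the n most
# frequent terms of a term-document matrix numerically, complementing the word
# clouds and bar plots; it reuses slam::row_sums like wordCloud() and barPlot().
topTerms <- function(x, n = 10){
    v <- sort(row_sums(x), decreasing = TRUE)
    head(data.frame(Word = names(v), Frequency = v, row.names = NULL), n)
}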
explore("./dataset/en_US/en_US.blogs.txt",500000)
explore("./dataset/en_US/en_US.news.txt",500000)
explore("./dataset/en_US/en_US.twitter.txt",500000)
Warning in readLines(file, n = sample_size): line 167155 appears to
contain an embedded nul
Warning in readLines(file, n = sample_size): line 268547 appears to
contain an embedded nul
explore("./dataset/en_US/tokenized_en_US.blogs.txt",500000)
explore("./dataset/en_US/tokenized_en_US.news.txt",500000)
explore("./dataset/en_US/tokenized_en_US.twitter.txt",500000)